-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathmake_cc12m_train_json.py
32 lines (27 loc) · 1.02 KB
/
make_cc12m_train_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Read cc12m.tsv into two parallel lists. Row i of the file yields
# captions[i] (the first tab-separated field, split on single spaces into a
# list of tokens) and urls[i] (the second tab-separated field).
# A row without a tab raises IndexError; rows are never skipped, because the
# row index is used later to derive the image file name.
captions = []
urls = []
# NOTE(review): UTF-8 is assumed for the TSV instead of the locale default --
# confirm against the actual file.
with open('cc12m.tsv', encoding='utf-8') as fp:
    for line in fp:
        # Strip only the line terminator before splitting. Slicing off the
        # last character unconditionally would truncate the URL when the
        # final row has no trailing newline or when a row has extra fields.
        s = line.rstrip('\n').split('\t')
        captions.append(s[0].split(' '))
        urls.append(s[1])
# Collect the entries of train_valid.txt (one per line) into a set used for
# membership tests against generated image file names.
valids = set()
with open('train_valid.txt') as fp:
    for line in fp:
        # Remove just the newline; dropping the last character blindly would
        # corrupt the final entry when the file lacks a trailing newline.
        valids.add(line.rstrip('\n'))
import json

# Write train.json as JSON Lines: one object per row whose zero-padded,
# eight-digit "<index>.jpg" name is present in `valids`.
with open('train.json', 'w') as outfile:
    for idx, (words, _url) in enumerate(zip(captions, urls)):
        name = "{:08d}.jpg".format(idx)
        if name not in valids:
            continue
        record = {'image': "train_image.zip@/{}".format(name), 'caption': words}
        outfile.write(json.dumps(record))
        outfile.write('\n')
import json

# Write train_frcnn.json as JSON Lines. Each object carries the same 'image'
# and 'caption' fields as train.json plus an 'frcnn' field naming the
# "<index>.json" entry for the same row index. Only rows whose image name is
# in `valids` are written.
with open('train_frcnn.json', 'w') as outfile:
    for idx, (words, _url) in enumerate(zip(captions, urls)):
        name = "{:08d}.jpg".format(idx)
        if name in valids:
            entry = {
                'image': "train_image.zip@/{}".format(name),
                'caption': words,
                'frcnn': "train_frcnn.zip@/{:08d}.json".format(idx),
            }
            outfile.write(json.dumps(entry) + '\n')