forked from elleryjsmith/UCLMCTest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcategorise.py
45 lines (30 loc) · 1.14 KB
/
categorise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import glob
import json
import os
def loadcategories(dataset):
    """Load the category annotations stored for one dataset.

    Every file matching ``categories/<dataset>/*.txt`` describes one
    category. The category name is the file's base name up to its first
    ".", with "|" translated to "/" (a "/" cannot appear in a file name).

    Each non-blank line is split on "." and only the third field is used;
    that field must be ``<story>,<question>``. Blank lines are skipped.

    Args:
        dataset: name of the sub-directory of ``categories/`` to read.

    Returns:
        A dict mapping each category name to a list of flat indices
        ``story * 4 + question``, in the order the lines appear.

    Raises:
        IndexError: a non-blank line has fewer than three "."-separated
            fields, or its third field has no ",".
        ValueError: the story or question number is not an integer.
    """
    # Flat indices are encoded as story * 4 + question.
    slots_per_story = 4

    cts = {}
    for path in glob.glob("categories/" + dataset + "/*.txt"):
        # Take the base name instead of a fixed "/"-split position: glob may
        # return OS-native separators, and `dataset` may itself contain "/".
        nm = os.path.basename(path).split(".")[0].replace("|", "/")
        entries = []
        with open(path, "r") as fl:
            for ln in fl:
                ln = ln.strip()
                if not ln:
                    # A blank line (e.g. a trailing one) holds no annotation.
                    continue
                ps = ln.split(".")[2].split(",")
                entries.append(int(ps[0]) * slots_per_story + int(ps[1]))
        cts[nm] = entries
    return cts
def loadstories(dataset):
    """Return the parsed contents of ``datasets/<dataset>.json``.

    Args:
        dataset: base name (without the ".json" suffix) of the file to read
            from the ``datasets/`` directory.

    Returns:
        Whatever object the JSON document decodes to.
    """
    path = "".join(["datasets/", dataset, ".json"])
    with open(path, "r") as handle:
        stories = json.load(handle)
    return stories
if __name__ == "__main__":
    # Dataset names are built as "mc<size>.<split>", e.g. "mc160.dev".
    datasets = ["mc" + d + "." + t
                for d in ["160", "500"]
                for t in ["dev", "train", "test"]]

    for dataset in datasets:
        stories = loadstories(dataset)
        categories = loadcategories(dataset)

        # Reset every story to four empty category lists, so re-running the
        # script does not accumulate duplicates from a previous run.
        for story in stories:
            story["categories"] = [[], [], [], []]

        for category in categories:
            for i in categories[category]:
                # `i` is a flat index (story * 4 + slot). Floor division is
                # required: `i / 4` is a float on Python 3 and cannot be
                # used as a list index.
                story_idx, slot = divmod(i, 4)
                stories[story_idx]["categories"][slot].append(category)

        # Overwrite the dataset file in place with the annotated stories.
        with open("datasets/" + dataset + ".json", "w") as fl:
            json.dump(stories, fl)