-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_j2e-dict_edict.py
49 lines (41 loc) · 1.36 KB
/
create_j2e-dict_edict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# coding: UTF-8
# usage:
# python create_j2e-dict_edict.py ~/src/corpus/EDICT/edict.tok.ja ~/src/corpus/EDICT/edict.tok.en ~/src/corpus/EDICT/j2e_dict.json
import sys
import json
import codecs
# A Japanese->English pair is kept only if it accounts for more than this
# fraction of all co-occurrences counted for the Japanese word.
prune_rate = 0.1

# Number of dictionary entries echoed to stdout as a sanity check.
PREVIEW_LIMIT = 200


def _tokenize(line):
    """Split a line on single spaces, dropping empty tokens.

    Blank lines and repeated spaces would otherwise yield "" tokens, which
    end up as dictionary keys and inflate the totals used for pruning.
    """
    return [word for word in line.strip().split(" ") if word]


def count_cooccurrences(ja_lines, en_lines):
    """Count how often each English token shares a line pair with each
    Japanese token.

    Args:
        ja_lines: iterable of space-tokenized Japanese lines.
        en_lines: iterable of space-tokenized English lines, paired with
            ``ja_lines`` by position (``zip`` stops at the shorter one).

    Returns:
        dict mapping Japanese token -> {English token: co-occurrence count}.
    """
    j2e = {}
    for ja_line, en_line in zip(ja_lines, en_lines):
        ja_words = _tokenize(ja_line)
        en_words = _tokenize(en_line)
        if not ja_words or not en_words:
            # Nothing to pair on this line; avoid creating empty entries.
            continue
        for j_w in ja_words:
            counts = j2e.setdefault(j_w, {})
            for e_w in en_words:
                counts[e_w] = counts.get(e_w, 0) + 1
    return j2e


def prune_translations(j2e, rate=prune_rate):
    """Keep, per Japanese token, the English tokens seen often enough.

    An English token survives when its count is strictly greater than
    ``rate`` times the sum of all counts for that Japanese token.
    Japanese tokens left with no survivors are omitted from the result.

    Args:
        j2e: dict as returned by ``count_cooccurrences``.
        rate: pruning threshold as a fraction of the per-token total.

    Returns:
        dict mapping Japanese token -> list of surviving English tokens.
    """
    pruned = {}
    for j_w, e_counts in j2e.items():
        threshold = sum(e_counts.values()) * rate
        kept = [e_w for e_w, e_cnt in e_counts.items() if e_cnt > threshold]
        if kept:
            pruned[j_w] = kept
    return pruned


def print_preview(mapping, limit=PREVIEW_LIMIT):
    """Print at most ``limit`` ``key : value`` entries of ``mapping``."""
    # dict views cannot be indexed or sliced on Python 3, so count while
    # iterating instead of writing ``mapping.items()[:limit]``.
    for shown, (key, value) in enumerate(mapping.items()):
        if shown >= limit:
            break
        # Keys are already text; encoding them would print a bytes repr.
        print("{} : {}".format(key, value))


def main(args):
    """Build the pruned dictionary from two parallel files and save it.

    Args:
        args: command-line vector; ``args[1]`` and ``args[2]`` are the
            tokenized Japanese and English files, ``args[3]`` is the path
            of the JSON file to write.
    """
    if len(args) < 4:
        sys.exit("usage: python {} <tokenized.ja> <tokenized.en> "
                 "<output.json>".format(args[0] if args else "script"))
    ja_name, en_name, j2e_name = args[1:4]

    # Context managers guarantee both inputs are closed, even on error.
    with codecs.open(ja_name, 'r', 'utf-8') as ja_file, \
            codecs.open(en_name, 'r', 'utf-8') as en_file:
        j2e = count_cooccurrences(ja_file, en_file)

    if j2e:
        # Debug aid kept from the original: type of the first key.
        print(type(next(iter(j2e))))
    print_preview(j2e)

    j2e_pruned = prune_translations(j2e, prune_rate)
    print(len(j2e_pruned))
    print_preview(j2e_pruned)

    with codecs.open(j2e_name, "w", "utf-8") as f:
        json.dump(j2e_pruned, f, indent=2, sort_keys=True,
                  ensure_ascii=False)


if __name__ == "__main__":
    main(sys.argv)