-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathrnn_w2v.py
42 lines (34 loc) · 1.03 KB
/
rnn_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from gensim.models import word2vec
import glob
import json
import csv
import random
import codecs
from config import Config
# Project configuration; supplies the annotation locations read below.
config = Config()

# Collect every annotation file from both the train and the test location.
train_paths = glob.glob(config.train_jsons+"/*.json")
test_paths = glob.glob(config.test_jsons+"/*.json")
paths = train_paths + test_paths

# Build one "sentence" per annotated shape: the characters of its label,
# lower-cased, with spaces removed (so each token is a single character).
raw_sentences = []
for path in paths:
    # Binary mode lets the json module detect the UTF encoding on its own.
    with open(path, 'rb') as f:
        data = json.load(f)
    for obj in data["shapes"]:
        label = obj["label"].replace(" ", "")
        raw_sentences.append([ch.lower() for ch in label])

# Labels that were empty or contained only spaces produce no tokens; drop them.
sentences = [s for s in raw_sentences if s]
# Dump the tokenised corpus to stdout before training.
print(sentences)

# min_count=1 keeps every token, including those that occur only once.
model = word2vec.Word2Vec(sentences, min_count=1)

keyed_vectors = model.wv
# gensim 4.0 renamed `index2word` to `index_to_key`; prefer the new name and
# fall back to the old one so the script runs on either major version.
vocab = getattr(keyed_vectors, "index_to_key", None)
if vocab is None:
    vocab = keyed_vectors.index2word

# One output line per token: "<token> <v1> <v2> ... <vN>".
# Vectors are looked up through `model.wv`; indexing the model object itself
# is deprecated in gensim 3.x and removed in 4.x.
vector = [
    each + ' ' + ' '.join(str(value) for value in keyed_vectors[each])
    for each in vocab
]
vect = '\n'.join(vector)

# The context manager guarantees the file is flushed and closed.
with codecs.open(config.w2v_path, 'w+', encoding='utf-8') as ff:
    ff.write(vect)