# sent_sim.py (forked from bojone/margin-softmax)
#! -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from keras.models import Model
from keras.layers import *
from keras import backend as K
from keras.constraints import unit_norm
from keras.callbacks import Callback
from margin_softmax import sparse_amsoftmax_loss

num_train_groups = 90000  # the first 90,000 question groups are used for training
maxlen = 32
batch_size = 100
min_count = 5
word_size = 128
epochs = 30  # amsoftmax needs about 25 epochs, the other losses about 20
# each row of tongyiju.csv: group id (column 0) \t sentence (column 1)
data = pd.read_csv('tongyiju.csv', encoding='utf-8', header=None, delimiter='\t')
def strQ2B(ustring):  # convert full-width characters to half-width
    rstring = ''
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # full-width space maps directly to an ASCII space
            inside_code = 32
        elif 65281 <= inside_code <= 65374:  # other full-width characters map by a fixed offset
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring
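# For example (a hypothetical input, not taken from the dataset):
# strQ2B(u'ＡＢＣ　１２３') == 'ABC 123'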
data[1] = data[1].apply(strQ2B)
data[1] = data[1].str.lower()
chars = {}
for s in tqdm(iter(data[1])):
    for c in s:
        if c not in chars:
            chars[c] = 0
        chars[c] += 1

# id 0 is reserved for padding, id 1 for unknown characters (unk)
chars = {i: j for i, j in chars.items() if j >= min_count}
id2char = {i + 2: j for i, j in enumerate(chars)}
char2id = {j: i for i, j in id2char.items()}
def string2id(s):
    # truncate to maxlen, map characters to ids (unseen -> 1), pad with 0
    ids = [char2id.get(i, 1) for i in s[:maxlen]]
    ids = ids + [0] * (maxlen - len(ids))
    return ids
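# For example (illustrative only -- the actual ids depend on the corpus):
# string2id(u'abc') -> [char2id.get(u'a', 1), char2id.get(u'b', 1), char2id.get(u'c', 1), 0, 0, ...]
# i.e. a length-32 list: character ids first, then zero padding.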
data[2] = data[1].apply(string2id)
train_data = data[data[0] < num_train_groups]
train_data = train_data.sample(frac=1)  # shuffle the training rows
x_train = np.array(list(train_data[2]))
y_train = np.array(list(train_data[0])).reshape((-1, 1))
valid_data = data[data[0] >= num_train_groups]
# The model proper: a GRU-based classifier over the training groups
x_in = Input(shape=(maxlen,))
x_embedded = Embedding(len(chars) + 2,
                       word_size)(x_in)
x = CuDNNGRU(word_size)(x_embedded)
x = Lambda(lambda x: K.l2_normalize(x, 1))(x)  # L2-normalize the sentence vector
pred = Dense(num_train_groups,
             use_bias=False,
             kernel_constraint=unit_norm())(x)  # unit-norm weights, so the logits are cosines

encoder = Model(x_in, x)  # the real goal is this encoder
model = Model(x_in, pred)  # but it is trained as a classification problem
model.compile(loss=sparse_amsoftmax_loss,
              optimizer='adam',
              metrics=['sparse_categorical_accuracy'])
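# sparse_amsoftmax_loss is imported from margin_softmax.py (not shown here).
# For orientation, here is a minimal sketch of the AM-Softmax idea -- NOT the
# imported implementation; scale=30 and margin=0.35 are assumptions taken from
# the AM-Softmax paper, not from this repo:
def sparse_amsoftmax_loss_sketch(y_true, y_pred, scale=30.0, margin=0.35):
    # y_pred holds cosine similarities, since both the sentence vector and the
    # Dense kernel columns are L2-normalized; subtract the margin from the
    # target-class cosine, rescale, then take ordinary softmax cross-entropy.
    y_true = K.cast(y_true[:, 0], 'int32')
    onehot = K.one_hot(y_true, num_train_groups)
    logits = scale * (y_pred - margin * onehot)
    return K.sparse_categorical_crossentropy(y_true, logits, from_logits=True)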
# Set up the ranking over the validation set.
# This could be done in plain numpy, but writing it in Keras runs it on the GPU.
x_in = Input(shape=(word_size,))
x = Dense(len(valid_data), use_bias=False)(x_in)  # dot products against every validation vector
x = Lambda(lambda x: K.tf.nn.top_k(x, 11)[1])(x)  # indices of the 11 most similar sentences
model_sort = Model(x_in, x)

# map from validation row index to group id
id2g = dict(zip(valid_data.index - valid_data.index[0], valid_data[0]))
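# Reference helper (not part of the original script): the numpy equivalent of
# model_sort. It makes the trick explicit -- loading the sentence vectors as
# the Dense kernel turns the layer into a dot product against every validation
# vector, and dot products of unit-norm vectors are cosine similarities.
def topk_numpy(query_vecs, index_vecs, k=11):
    sims = np.dot(query_vecs, index_vecs.T)   # cosine similarity matrix
    return np.argsort(-sims, axis=1)[:, :k]   # indices of the k most similar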
def evaluate():  # evaluation on the validation set
    print('validating...')
    valid_vec = encoder.predict(np.array(list(valid_data[2])),
                                verbose=True,
                                batch_size=1000)  # encode the validation sentences
    model_sort.set_weights([valid_vec.T])  # load the sentence vectors as the Dense weights
    sorted_result = model_sort.predict(valid_vec,
                                       verbose=True,
                                       batch_size=1000)  # top-k neighbours for every sentence
    new_result = np.vectorize(lambda s: id2g[s])(sorted_result)  # row indices -> group ids
    hits = new_result[:, 0] != new_result[:, 0]  # an all-False boolean vector
    for i in range(10):  # sorted by similarity, neighbour 0 is the query itself, so skip it
        hits = hits + (new_result[:, 0] == new_result[:, i + 1])
        if i + 1 == 1:
            top1_acc = 1. * hits.sum() / len(hits)
        elif i + 1 == 5:
            top5_acc = 1. * hits.sum() / len(hits)
        elif i + 1 == 10:
            top10_acc = 1. * hits.sum() / len(hits)
    return top1_acc, top5_acc, top10_acc
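# Reading the result: a sentence counts as a top-k hit if any of its k nearest
# (non-identical) neighbours shares its group id; e.g. top5_acc is the fraction
# of validation sentences whose group appears among neighbours 1-5.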
# Callback: compute validation accuracy after each epoch and keep the best model
class Evaluate(Callback):
    def __init__(self):
        self.accs = {'top1': [], 'top5': [], 'top10': []}
        self.highest = 0.
    def on_epoch_end(self, epoch, logs=None):
        top1_acc, top5_acc, top10_acc = evaluate()
        self.accs['top1'].append(top1_acc)
        self.accs['top5'].append(top5_acc)
        self.accs['top10'].append(top10_acc)
        if top1_acc >= self.highest:  # save the best weights so far
            self.highest = top1_acc
            model.save_weights('sent_sim_amsoftmax.model')
        json.dump({'accs': self.accs, 'highest_top1': self.highest},
                  open('valid_amsoftmax.log', 'w'), indent=4)
        print('top1_acc: %s, top5_acc: %s, top10_acc: %s' % (top1_acc, top5_acc, top10_acc))
evaluator = Evaluate()
history = model.fit(x_train,
                    y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    callbacks=[evaluator])

valid_vec = encoder.predict(np.array(list(valid_data[2])),
                            verbose=True,
                            batch_size=1000)  # final sentence vectors for the validation set
def most_similar(s):
    # encode the query, then rank all validation sentences by cosine similarity
    v = encoder.predict(np.array([string2id(s)]))[0]
    sims = np.dot(valid_vec, v)
    for i in sims.argsort()[-10:][::-1]:
        print(valid_data.iloc[i][1], sims[i])

most_similar(u'ps格式可以转换成ai格式吗')  # "can the ps format be converted to ai format?"
most_similar(u'广州的客运站的数目')  # "the number of coach stations in Guangzhou"
most_similar(u'沙发一般有多高')  # "how tall is a typical sofa?"