forked from datienguyen/cnn_coherence
nur_cnn.py · 256 lines (194 loc) · 10.1 KB
from __future__ import division, print_function

from keras.layers import AveragePooling1D, Flatten, Input, Embedding, LSTM, Dense, merge, Convolution1D, MaxPooling1D, Dropout
from keras.models import Model
from keras import objectives
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras import backend as K
from keras import optimizers

import numpy as np

from utilities import my_callbacks_00
from utilities import data_helper

import optparse
import sys

def ranking_loss(y_true, y_pred):
    # pairwise ranking loss without tree distance
    pos = y_pred[:, 0]
    neg = y_pred[:, 1]
    #loss = -K.sigmoid(pos - neg)  # logistic ranking loss
    loss = K.maximum(1.0 + neg - pos, 0.0)  # margin ranking loss
    # y_true is a dummy target; only the pair of predicted scores matters
    return K.mean(loss) + 0 * y_true
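
# Worked example of the margin loss above: with a margin of 1.0, a correctly ranked
# pair pos=2.0, neg=0.5 incurs max(1 + 0.5 - 2.0, 0) = 0 loss, while a mis-ranked
# pair pos=0.5, neg=2.0 incurs max(1 + 2.0 - 0.5, 0) = 2.5.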

def compute_recall_ks(probas):
    # evaluate recall@k for several candidate-group sizes
    recall_k = {}
    for group_size in [2, 5, 10]:
        recall_k[group_size] = {}
        print('group_size: %d' % group_size)
        for k in [1, 2, 5]:
            if k < group_size:
                recall_k[group_size][k] = recall(probas, k, group_size)
                print('recall@%d' % k, recall_k[group_size][k])
    return recall_k

def recall(probas, k, group_size):
    # probas is split into blocks of test_size scores; a block counts as correct
    # when the first candidate (index 0) ranks among the top k of the first
    # group_size candidates
    test_size = 10
    n_batches = len(probas) // test_size
    n_correct = 0
    for i in range(n_batches):
        batch = np.array(probas[i * test_size:(i + 1) * test_size])[:group_size]
        #p = np.random.permutation(len(batch))
        #indices = p[np.argpartition(batch[p], -k)[-k:]]
        indices = np.argpartition(batch, -k)[-k:]
        if 0 in indices:
            n_correct += 1
    return n_correct / (len(probas) / test_size)
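
# Worked example of recall@k: for group_size=2 and k=1, each block of 10 scores
# contributes a hit only when the first score is the largest of the first two,
# e.g. probas = [0.9, 0.1, ...] counts as correct and [0.1, 0.9, ...] does not.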

if __name__ == '__main__':
    # parse user input
    parser = optparse.OptionParser("%prog [options]")

    # file-related options
    parser.add_option("-g", "--log-file", dest="log_file", help="log file [default: %default]")
    parser.add_option("-d", "--data-dir", dest="data_dir", help="directory containing the lists of train, test and dev files [default: %default]")
    parser.add_option("-m", "--model-dir", dest="model_dir", help="directory to save the best models [default: %default]")
    parser.add_option("-t", "--max-length", dest="maxlen", type="int", help="maximum length (for fixed-size input) [default: %default]")  # input size
    parser.add_option("-f", "--nb_filter", dest="nb_filter", type="int", help="number of filters applied in the convolution over words [default: %default]")
    #parser.add_option("-r", "--filter_length", dest="filter_length", type="int", help="length of neighborhood in words [default: %default]")
    parser.add_option("-w", "--w_size", dest="w_size", type="int", help="window size: length of the neighborhood in words [default: %default]")
    parser.add_option("-p", "--pool_length", dest="pool_length", type="int", help="length for max pooling [default: %default]")
    parser.add_option("-e", "--emb-size", dest="emb_size", type="int", help="dimension of embedding [default: %default]")
    parser.add_option("-s", "--hidden-size", dest="hidden_size", type="int", help="hidden layer size [default: %default]")
    parser.add_option("-o", "--dropout_ratio", dest="dropout_ratio", type="float", help="ratio of cells to drop out [default: %default]")
    parser.add_option("-a", "--learning-algorithm", dest="learn_alg", help="optimization algorithm (adam, sgd, adagrad, rmsprop, adadelta) [default: %default]")
    parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int", help="minibatch size [default: %default]")
    parser.add_option("-l", "--loss", dest="loss", help="loss type (hinge, squared_hinge, binary_crossentropy) [default: %default]")
    parser.add_option("-n", "--epochs", dest="epochs", type="int", help="number of epochs [default: %default]")

    parser.set_defaults(
        data_dir="./final_data/",
        log_file="log",
        model_dir="./saved_model/",
        learn_alg="adam",           # sgd, adagrad, rmsprop, adadelta, adam (default)
        loss="ranking_loss",        # hinge, squared_hinge, binary_crossentropy, ranking_loss (default)
        minibatch_size=32,
        dropout_ratio=0.5,
        maxlen=1000,
        epochs=30,
        emb_size=100,
        hidden_size=250,
        nb_filter=150,
        w_size=5,
        pool_length=6,
    )

    opts, args = parser.parse_args(sys.argv)
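
    # Example invocation (illustrative; the values shown simply repeat the defaults
    # set above, and the train/dev/test file lists are hard-coded below as ./list.*.small):
    #
    #   python nur_cnn.py -d ./final_data/ -m ./saved_model/ -e 100 -t 1000 \
    #                     -f 150 -w 5 -p 6 -o 0.5 -b 32 -n 30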

    print('Loading vocabs for the whole dataset...')
    vocabs, E = data_helper.init_vocab(opts.emb_size)

    print("--------------------------------------------------")
    print("Loading entity grids for pos and neg documents...")
    X_train_1, X_train_0 = data_helper.load_and_numberize_egrids(filelist="./list.train.small",
                                                                 maxlen=opts.maxlen, w_size=opts.w_size, vocabs=vocabs)
    X_dev_1, X_dev_0 = data_helper.load_and_numberize_egrids(filelist="./list.dev.small",
                                                             maxlen=opts.maxlen, w_size=opts.w_size, vocabs=vocabs)
    X_test_1, X_test_0 = data_helper.load_and_numberize_egrids(filelist="./list.test.small",
                                                               maxlen=opts.maxlen, w_size=opts.w_size, vocabs=vocabs)

    num_train = len(X_train_1)
    num_dev = len(X_dev_1)
    num_test = len(X_test_1)
    print(num_train, num_dev, num_test)

    # assign Y values: the positive document always comes first in each pair
    y_train_1 = [1] * num_train
    y_dev_1 = [1] * num_dev
    y_test_1 = [1] * num_test

    print('.....................................')
    print("Num of training pairs: " + str(num_train))
    print("Num of dev pairs: " + str(num_dev))
    print("Num of test pairs: " + str(num_test))
    #print("Num of permutations in train: " + str(opts.p_num))
    print("The maximum input length for the CNN: " + str(opts.maxlen))
    print('.....................................')

    # the one-hot targets are dummies: ranking_loss ignores y_true, they only fix the output shape
    y_train_1 = np_utils.to_categorical(y_train_1, 2)
    y_dev_1 = np_utils.to_categorical(y_dev_1, 2)
    y_test_1 = np_utils.to_categorical(y_test_1, 2)

    # randomly shuffle the training data (same seed so the pos/neg pairs stay aligned)
    np.random.seed(113)
    np.random.shuffle(X_train_1)
    np.random.seed(113)
    np.random.shuffle(X_train_0)

    # first, define a CNN model over the sequence of entities
    sent_input = Input(shape=(opts.maxlen,), dtype='int32', name='sent_input')

    # the embedding layer encodes the input into a sequence of emb_size-dimensional vectors
    x = Embedding(output_dim=opts.emb_size, weights=[E], input_dim=len(vocabs), input_length=opts.maxlen)(sent_input)

    # add a 1D convolution layer
    #x = Dropout(opts.dropout_ratio)(x)
    #x = Convolution1D(nb_filter=opts.nb_filter, filter_length=opts.w_size, border_mode='valid', activation='relu', subsample_length=1)(x)
    #x = MaxPooling1D(pool_length=opts.pool_length)(x)
    #x = Dropout(opts.dropout_ratio)(x)
    x = Convolution1D(nb_filter=opts.nb_filter, filter_length=opts.w_size, border_mode='valid', activation='relu', subsample_length=1)(x)

    # add a max-pooling layer
    #x = AveragePooling1D(pool_length=opts.pool_length)(x)
    x = MaxPooling1D(pool_length=opts.pool_length)(x)
    x = Dropout(opts.dropout_ratio)(x)
    x = Flatten()(x)
    #x = Dense(opts.hidden_size, activation='relu')(x)
    x = Dropout(opts.dropout_ratio)(x)

    # the latent coherence score
    out_x = Dense(1, activation='linear')(x)
    shared_cnn = Model(sent_input, out_x)

    # inputs for the pos and neg documents
    pos_input = Input(shape=(opts.maxlen,), dtype='int32', name="pos_input")
    neg_input = Input(shape=(opts.maxlen,), dtype='int32', name="neg_input")

    # the two branches share everything in shared_cnn
    pos_branch = shared_cnn(pos_input)
    neg_branch = shared_cnn(neg_input)

    concatenated = merge([pos_branch, neg_branch], mode='concat', name="coherence_out")

    # the output is the pair of latent coherence scores
    final_model = Model([pos_input, neg_input], concatenated)

    #final_model.compile(loss='ranking_loss', optimizer='adam')
    #sgd = optimizers.SGD(lr=0.01, decay=1e-2)#, momentum=0.9)
    # note: the loss and optimizer are fixed here; opts.loss and opts.learn_alg are not consulted
    final_model.compile(loss={'coherence_out': ranking_loss}, optimizer='adam')
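
    # Shape walk-through with the default options (illustrative): an input of length
    # maxlen=1000 is embedded to (1000, 100); the valid convolution with w_size=5 and
    # nb_filter=150 yields (996, 150); max pooling with pool_length=6 gives (166, 150);
    # Flatten produces a 24900-dim vector, Dense(1) emits one scalar coherence score per
    # document, and merging the two branches gives a (batch, 2) output for the ranking loss.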

    # set up the custom callback; histories.accs is read after each epoch below
    histories = my_callbacks_00.Histories()

    print(shared_cnn.summary())
    #print(final_model.summary())
    print("------------------------------------------------")

    model_name = opts.model_dir + "CNN_" + str(opts.dropout_ratio) + "_" + str(opts.emb_size) + "_" + str(opts.maxlen) + "_" \
        + str(opts.w_size) + "_" + str(opts.nb_filter) + "_" + str(opts.pool_length) + "_" + str(opts.minibatch_size)

    print("Model name: " + model_name)
    print("Training model...")

    bestAcc = 0.0
    patience = 0

    for ep in range(1, opts.epochs):
        final_model.fit([X_train_1, X_train_0], y_train_1, validation_data=([X_dev_1, X_dev_0], y_dev_1), nb_epoch=1,
                        verbose=1, batch_size=opts.minibatch_size, callbacks=[histories])
        final_model.save(model_name + "_ep." + str(ep) + ".h5")

        curAcc = histories.accs[0]
        if curAcc >= bestAcc:
            bestAcc = curAcc
            patience = 0
        else:
            patience = patience + 1

        # classify the test set
        y_pred = final_model.predict([X_test_1, X_test_0])

        """
        ties = 0
        wins = 0
        n = len(y_pred)
        for i in range(0, n):
            if y_pred[i][0] > y_pred[i][1]:
                wins = wins + 1
            elif y_pred[i][0] == y_pred[i][1]:
                ties = ties + 1
        print("Performance on the test set after epoch " + str(ep) + ":")
        print(" -Wins: " + str(wins) + " Ties: " + str(ties))
        loss = n - (wins + ties)
        #recall = wins / n
        prec = wins / (wins + loss)
        #f1 = 2 * prec * recall / (prec + recall)
        print(" -Test acc: " + str(wins / n))
        #print(" -Test f1 : " + str(f1))
        """

        print("Performance on the test set after epoch " + str(ep) + ":")
        recall_k = compute_recall_ks(y_pred[:, 1])

        # stop training once the dev accuracy has not improved for more than 10 epochs
        if patience > 10:
            print("Early stopping at epoch: " + str(ep))
            break

    print("Best performance on the dev set: " + str(bestAcc))
    print("Finish training and testing...")