main.py
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

import configuration as config
import models


class PreProcessing:

    def loadData(self):
        print("loading data...")
        data_src = config.data_src

        # Positive and negative examples live in two separate files,
        # one example per line.
        with open(data_src[0], "r") as f:
            text_pos = f.readlines()
        with open(data_src[1], "r") as f:
            text_neg = f.readlines()

        texts = text_pos + text_neg
        labels = [1] * len(text_pos) + [0] * len(text_neg)

        # Build the vocabulary and turn each text into a sequence of word indices.
        tokenizer = Tokenizer(num_words=config.MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        # Pad/truncate every sequence to a fixed length; one-hot encode the labels.
        data = pad_sequences(sequences, maxlen=config.MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(labels))
        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)

        # Shuffle, then split the data into a training set and a validation set.
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(config.VALIDATION_SPLIT * data.shape[0])

        self.x_train = data[:-nb_validation_samples]
        self.y_train = labels[:-nb_validation_samples]
        self.x_val = data[-nb_validation_samples:]
        self.y_val = labels[-nb_validation_samples:]
        self.word_index = word_index

    def loadEmbeddings(self):
        embeddings_src = config.embeddings_src
        word_index = self.word_index

        # Read pre-trained word vectors (one word followed by its coefficients
        # per line). Only the first 10000 lines are read to keep start-up fast.
        embeddings_index = {}
        with open(embeddings_src) as f:
            for i, line in enumerate(f):
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
                EMBEDDING_DIM = len(coefs)
                if i >= 10000:
                    break
        print('Found %s word vectors.' % len(embeddings_index))

        # Build the embedding matrix; words without a pre-trained vector
        # (including any beyond the 10000-line cut-off) stay all-zeros.
        embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        self.embedding_matrix = embedding_matrix
        self.EMBEDDING_DIM = EMBEDDING_DIM
        self.MAX_SEQUENCE_LENGTH = config.MAX_SEQUENCE_LENGTH


def main():
    preprocessing = PreProcessing()
    preprocessing.loadData()
    preprocessing.loadEmbeddings()

    cnn_model = models.CNNModel()

    # Collect the hyper-parameters the model builder expects.
    params_obj = config.Params()
    params_obj.num_classes = 2
    params_obj.vocab_size = len(preprocessing.word_index)
    params_obj.inp_length = preprocessing.MAX_SEQUENCE_LENGTH
    params_obj.embeddings_dim = preprocessing.EMBEDDING_DIM

    # Build the model, seeding it with the pre-trained embedding matrix.
    model = cnn_model.getModel(params_obj=params_obj, weight=preprocessing.embedding_matrix)

    x_train, y_train = preprocessing.x_train, preprocessing.y_train
    x_val, y_val = preprocessing.x_val, preprocessing.y_val

    # train
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=params_obj.num_epochs, batch_size=params_obj.batch_size)

    # evaluate on the held-out validation split
    score = model.evaluate(x_val, y_val, batch_size=params_obj.batch_size)
    print('Validation score:', score)


if __name__ == "__main__":
    main()
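
main.py imports two local modules, configuration and models, whose source is not shown on this page. The two sketches below reconstruct only the interface that main.py actually touches; they are assumptions, not the repository's code. Every concrete value in the configuration sketch (file paths, vocabulary cap, sequence length, split ratio, epochs, batch size) is a placeholder chosen to make the sketch runnable.

# configuration.py -- hypothetical sketch of the settings main.py reads
data_src = ["data/pos.txt", "data/neg.txt"]  # [positive file, negative file], one text per line
embeddings_src = "data/glove.6B.100d.txt"    # pre-trained word vectors, one word per line
MAX_NB_WORDS = 20000          # vocabulary cap passed to the Tokenizer
MAX_SEQUENCE_LENGTH = 100     # every sequence is padded/truncated to this length
VALIDATION_SPLIT = 0.2        # fraction of the data held out for validation

class Params:
    """Mutable bag of hyper-parameters; main() fills in the data-dependent ones."""
    num_epochs = 10
    batch_size = 32

CNNModel.getModel is likewise unseen here. A generic Kim-style text CNN that matches the getModel(params_obj=..., weight=...) call signature and the shapes produced by PreProcessing could look like this:

# models.py -- hypothetical sketch, not the archived repository's actual model
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

class CNNModel:
    def getModel(self, params_obj, weight=None):
        # Input: a padded sequence of word indices.
        inputs = Input(shape=(params_obj.inp_length,), dtype='int32')

        # Embedding layer seeded with the matrix built in loadEmbeddings();
        # vocab_size + 1 matches that matrix's row count.
        x = Embedding(params_obj.vocab_size + 1,
                      params_obj.embeddings_dim,
                      weights=[weight],
                      input_length=params_obj.inp_length,
                      trainable=False)(inputs)

        # Convolve over 5-word windows, keep the strongest response per filter.
        x = Conv1D(128, 5, activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dropout(0.5)(x)
        x = Dense(128, activation='relu')(x)
        outputs = Dense(params_obj.num_classes, activation='softmax')(x)

        model = Model(inputs, outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model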