-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword2vec.py
199 lines (165 loc) · 7.44 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding: utf8 -*-
#Course : Natural Language Processing, Spring 2018 taught by Prof. Chitta Baral
import numpy as np
import pandas as pd
import re, json, nltk
from nltk.corpus import stopwords
# a tweet tokenizer from nltk.
#from keras.preprocessing.text import Tokenizer
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.deprecated.doc2vec import LabeledSentence
LabeledSentence = gensim.models.deprecated.doc2vec.LabeledSentence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
array = ["morphine", "methadone", "buprenorphine", "hydrocodone", "oxycodone","heroin", "oxycontin", "perc", "percocet","palladone" , "vicodin", "percodan", "tylox" ,"demerol", "oxy", "roxies","opiates", "oxy", "percocet", "percocets", "hydrocodone", "norco",
"norcos", "roxy", "roxies", "roxycodone", "roxicodone", "opana", "opanas", "prozac", "painrelief", "painreliever", "painkillers", "addiction", "opium"]
#making a Dictionary of the Stopwards
words = {}
for i in stopwords.words("english"):
k = i.encode("utf8")
words[k] = 0
# regular expressions used to clean up the tweet data
drug = re.compile('|'.join(array).lower())
http_re = re.compile(r'\s+http://[^\s]*')
remove_ellipsis_re = re.compile(r'\.\.\.')
at_sign_re = re.compile(r'\@\S+')
punct_re = re.compile(r"[\"'\[\],.:;()\-&!]")
price_re = re.compile(r"\d+\.\d\d")
number_re = re.compile(r"\d+")
def normalize_tweet(tweet):
#Regular expressions to replace patterns in the Data
t = tweet.lower()
t = re.sub(price_re, 'PRICE', t)
t = re.sub(remove_ellipsis_re, '', t)
t = re.sub(drug, 'druginstance', t)
t = re.sub(http_re, ' LINK', t)
t = re.sub(punct_re, '', t)
t = re.sub(at_sign_re, '@', t)
t = re.sub(number_re, 'NUM', t)
return t
def feature_extractor(tweet):
#remove new lines
#Fixing the upper bound for number of Features. This can be changed.
max_features = 2000
new_tweet = tweet.strip().lower().encode('ascii', errors='ignore')
new_tweet = normalize_tweet(new_tweet)
#print new_tweet
#remove new lines (\n)
#new_tweet = re.sub(r"\n", " ", new_tweet)
words_in_tweet = new_tweet.split(" ")
#Removing stop words from the tweets
words_in_tweet = [x for x in words_in_tweet if x not in words]
# for i,j in enumerate(words_in_tweet):
# if j in array:
# words_in_tweet[i] = "druginstance"
#Decoding them back into ASCII to convert them back into unicode
new_tweet = " ".join(words_in_tweet).decode("ascii", errors = "ignore")
#print new_tweet
#print ("\n")
tokens = tokenizer.tokenize(new_tweet)
#raw_input("please press enter ...")
return tokens
#open the JSON to a list of dictionaries
with open("Merged_Labelled.json", "r") as f:
original_data = json.load(f)
data = original_data
#print data[0]
#Turning it into a DataFrame
# cut_off = int (len(data) * 0.90)
# train_data = data[:cut_off]
# test_data = data[cut_off:]
#Creating a list of tuples for the input JSON
formatted_data = [(d["label"],feature_extractor(d["tweet"])) for d in data if d["label"] != "None"]
#test_set = [(d["label"],feature_extractor(d["tweet"])) for d in test_data]
#Creating DataFrame and removing data with Sentiments as None and other invalid sentiment values.
#This is done because the LSTM expects the Class label to be a Floating point number (Probabilities)
df = pd.DataFrame(formatted_data, columns = ["sentiment", "tweet"])
df = df[df["sentiment"]!= "None"]
df = df[df["sentiment"]!= "none"]
df = df[df["sentiment"] != "10"]
df = df[df["sentiment"] != "01"]
#print df.head(10)["tweet"]
#removing the itemid
#df.drop("index", inplace = True, axis = 1)
#Creating the TEST TRAIN Split using sklearn, with 30% of data as Test.
x_train, x_test, y_train, y_test = train_test_split(np.array(df.tweet),
np.array(df.sentiment), test_size=0.3)
#Converting the Sentiment labels to Float
y_train = [map(float, x) for x in y_train]
y_test = [map(float, x) for x in y_test]
#print y_train
#Converting the Tokens into Labelled Sentence object.
def labelizeTweets(tweets, label_type):
labelized = []
for i,v in enumerate(tweets):
label = '%s_%s'%(label_type,i)
labelized.append(LabeledSentence(v, [label]))
return labelized
x_train_label = labelizeTweets(x_train, 'TRAIN')
x_test_label = labelizeTweets(x_test, 'TEST')
#Output of the format:
#
#print x_train[3][0]
#we set the dimensions to 200 (Default). sliding window 10. building Word2vec for words that I have occured a minimum times of 5.
word_to_vec = Word2Vec(size = 200, window = 10, min_count=10, workers = 11, alpha = 0.025, iter = 20)
word_to_vec.build_vocab([x[0] for x in x_train_label])
#Number of words for which vectors are built
m = word_to_vec.corpus_count
#print m
#Training. 20 iterations
word_to_vec.train([x[0] for x in x_train], epochs = word_to_vec.iter, total_examples = m)
#print tweet_w2v["druginstance"]
#final_embedding = tweet_w2v._nemb_final.eva
#The weight vectors generated from the word2vec are stored in the syn0 dictionary
pretrained_weights = word_to_vec.wv.syn0
vocabulary_size, size_embedding = pretrained_weights.shape
#print pretrained_weights.shape
#Trying to Buid the LSTM network
print 'building tf-idf matrix ...'
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
#fitting the TFIDF for the X-train-label
matrix = vectorizer.fit_transform([x[0] for x in x_train_label])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
#print 'vocab : ', tfidf
#print tfidf[u"druginstance"]
#function that given a list of tokens, creates an averaged TWeet vector
def buildWordVector(tokens, size):
vec = np.zeros(size).reshape((1, size))
count = 0.
for word in tokens:
try:
vec += word_to_vec[word].reshape((1, size)) * tfidf[word]
count += 1.
except KeyError: # handling the case where the token is not
# in the corpus.
continue
if count != 0:
vec /= count
return vec
#Converted X_train and X_test into list of vectors and also scaling them so that they have a Mean of 0
#and a Standard Deviation of 1
train_word_to_vec = np.concatenate([buildWordVector(z, 200) for z in map(lambda x: x[0], x_train_label)])
train_vecs_w2v = scale(train_word_to_vec)
test_word_to_vec = np.concatenate([buildWordVector(z, 200) for z in map(lambda x: x[0], x_test_label)])
test_word_to_vec = scale(test_word_to_vec)
#we are using a 2 layer LSTM. 100 neurons in Dense layer 1 and layer 2 has 1.
model = Sequential()
#USing the Keras Built-in Optimizer. Relu for 1st layer and Sigmoid for the next.
model.add(Dense(100, activation='relu', input_dim=200))
#model.add(Dense(100, activation = "relu", input_dim = 200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
#Set the number of epochs to be 75, batch_size to 32 and verbose to 2 to see as much info of training process.
model.fit(train_word_to_vec, np.array(y_train), epochs=75, batch_size=32, verbose=2)
#Highest Accuracy achieved 57.35%
score = model.evaluate(test_word_to_vec, np.array(y_test), batch_size=8, verbose=2)
print "the accuracy is", score[1]