-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdialogue_manager.py
219 lines (189 loc) · 10.7 KB
/
dialogue_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import gensim,logging
from gensim.models import Word2Vec
import scipy.spatial
import numpy as np
import string
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
class dialogue:
    """Retrieval-style shopping assistant: maps a client sentence to a canned reply.

    Construction reads tab-separated (client utterance, AI reply) pairs,
    expands them with paraphrased openers, and builds a (replies x words)
    evidence matrix where each cell is
    count(word seen with this reply) / count(word overall).
    ``respond`` sums the matrix columns of the words in a sentence and
    returns the reply with the highest total. Words outside the training
    vocabulary are mapped through word2vec to the nearest known word or to
    a coarse shopping topic, or dropped.
    """

    # Interchangeable ways of opening a request; each one found in a training
    # utterance is swapped for every other one to generate extra pairs.
    _OPENERS = (
        "I'm looking for", "I want to buy", "I need", "Where can I find",
        "Where could I get", "Where can I buy", "Where could I buy",
        "Do you have", "Can I get", "I'd like to have", "I'd like to get",
    )

    # (word substituted into the sentence, label used in the debug print,
    #  word2vec anchor words whose smallest distance represents the topic)
    _TOPICS = (
        ("furniture", "furniture", ("furniture",)),
        ("food", "food", ("food",)),
        ("electronics", "electronics", ("electronics",)),
        ("office", "office", ("office",)),
        ("sports", "sports", ("sports",)),
        ("clothes", "clothing", ("clothing", "clothes")),
        ("perfume", "perfume", ("perfume",)),
        ("jewelry", "jewelry", ("jewelry",)),
        ("household", "household", ("household",)),
    )

    def __init__(self,
                 model_path='models/GoogleNews-vectors-negative300.bin.gz',
                 pairs_path="shop.plist.txt",
                 generated_path="generated_pairs.txt"):
        """Load the word2vec model and build every lookup table.

        Args:
            model_path: word2vec vectors in the C binary format.
            pairs_path: tab-separated file; column 0 is a client utterance,
                column 1 the AI reply it should trigger.
            generated_path: file the augmented pairs are written to and then
                read back from.
        """
        # NOTE(review): newer gensim releases expose this loader on
        # KeyedVectors instead of Word2Vec - confirm against the installed version.
        self.model = Word2Vec.load_word2vec_format(model_path, binary=True)  # C binary format
        raw_utterance_pairs = np.genfromtxt(pairs_path, dtype=None, delimiter="\t")

        # --- Generate extra training pairs by swapping request openers ---
        # `known` holds every client utterance already present (original or
        # generated) so the same utterance is never added twice; duplicates
        # would inflate the counts in the evidence matrix.
        known = set(raw_utterance_pairs[:, 0].tolist())
        to_add = []
        for pair in raw_utterance_pairs:
            query, answer = pair[0], pair[1]
            if not any(opener in query for opener in self._OPENERS):
                continue
            for old in self._OPENERS:
                if old not in query:
                    continue  # replacing an absent opener only reproduces `query`
                for new in self._OPENERS:
                    candidate = query.replace(old, new)
                    if candidate in known:
                        continue
                    # exceptions: "buy" makes no sense together with "ATM" or "eat"
                    if "buy" in candidate and ("ATM" in candidate or "eat" in candidate):
                        continue
                    known.add(candidate)
                    to_add.append([candidate, answer])
        # np.vstack cannot stack an empty list onto the (N, 2) array
        all_pairs = np.vstack([raw_utterance_pairs, to_add]) if to_add else raw_utterance_pairs
        np.savetxt(generated_path, all_pairs, delimiter="\t", fmt="%s")

        # --- Read all pairs back and give every AI reply an ID ---
        self.utterance_pairs = np.genfromtxt(generated_path, dtype=None, delimiter="\t")
        self.client_utterances = self.utterance_pairs[:, 0]
        # the set removes duplicate replies; sorting keeps the IDs reproducible
        self.AI_utterances = sorted(set(self.utterance_pairs[:, 1]))
        self.utterance_map = dict(zip(self.AI_utterances, range(len(self.AI_utterances))))
        self.reversed_utterance_map = dict((idx, utt) for utt, idx in self.utterance_map.items())

        # --- Standardise the client words and give every word an ID ---
        # punctuation is removed so that "Hi!" and "Hi" are the same word
        self.exclude = set(['.', ',', '?', '!', '-'])
        all_words = [self._normalize_word(w) for w in " ".join(self.client_utterances).split()]
        self.client_vocabulary = sorted(set(all_words))
        self.dictionary = dict(zip(self.client_vocabulary, range(len(self.client_vocabulary))))
        print("All the words that the AI knows: \n%s" % (self.dictionary,))
        self.reversed_dictionary = dict((idx, word) for word, idx in self.dictionary.items())

        # --- Evidence matrix (replies x words) ---
        # Count how often each word occurs in utterances leading to each reply.
        # Words inside the AI replies themselves are disregarded.
        self.occurence_matrix = np.zeros((len(self.AI_utterances), len(self.client_vocabulary)))
        for pair in self.utterance_pairs:
            row = self.utterance_map[pair[1]]
            for word in pair[0].split():
                self.occurence_matrix[row, self.dictionary[self._normalize_word(word)]] += 1
        # Normalise each column by the word's total count. Every vocabulary
        # word comes from these same utterances, so no column total is zero.
        tot_occurences = np.sum(self.occurence_matrix, axis=0)
        self.occurence_matrix = self.occurence_matrix / tot_occurences[np.newaxis, :]

        # --- word2vec vectors for the vocabulary, by word and by word ID ---
        # Both maps receive the same object per word; the value is None when
        # the model has no vector for the word.
        self.idx_to_vectors = {}
        self.words_to_vectors = {}
        for idx, word in enumerate(self.client_vocabulary):
            try:
                vector = self.model[word]
            except KeyError:
                # word2vec has no vector for some prepositions
                print("%s  is too common or unknown and has no vector" % word)
                vector = None
            self.idx_to_vectors[idx] = vector
            self.words_to_vectors[word] = vector

    def _normalize_word(self, word):
        """Strip excluded punctuation from *word* and lowercase it.

        "I" and words containing "I'" (e.g. "I'm") keep their capital letter.
        """
        word = ''.join(ch for ch in word if ch not in self.exclude)
        if word == "I" or "I'" in word:
            return word
        return word.lower()

    def _closest_topic(self, word):
        """Return (topic, cosine distance) for the topic nearest to *word*.

        Prints the distance to every topic. Propagates the lookup error of
        ``self.model`` when *word* or an anchor word has no vector.
        """
        word_vec = self.model[word]
        distances = []
        for _topic, label, anchors in self._TOPICS:
            dist = np.min([scipy.spatial.distance.cosine(word_vec, self.model[anchor])
                           for anchor in anchors])
            # feel free to comment this printing out
            print("dist to %s: %s" % (label, dist))
            distances.append(dist)
        best = int(np.argmin(distances))
        return self._TOPICS[best][0], distances[best]

    def find_closest_word(self, word):
        """Find the vocabulary word whose vector is nearest to *word*.

        Args:
            word: a word to look up in ``self.model``.

        Returns:
            (distance, closest_word): the smallest cosine distance and the
            vocabulary word it belongs to; (inf, "") when no vocabulary word
            has a vector.

        Raises:
            Whatever ``self.model[word]`` raises for a word it does not know
            (presumably KeyError with gensim - confirm for the installed version).
        """
        word_vec = self.model[word]  # get the unknown word's vector
        print(np.shape(word_vec))
        closest = ""
        min_dist = float("inf")
        for w in self.dictionary:
            known_vec = self.words_to_vectors[w]
            # `is None`, not `== None`: comparing an array with == is
            # elementwise and has no single truth value
            if known_vec is None:
                continue
            c_d = scipy.spatial.distance.cosine(word_vec, known_vec)
            if c_d < min_dist:
                min_dist = c_d
                closest = w
        print("%s %s" % (min_dist, closest))
        return min_dist, closest

    def respond(self, sentence):
        """Choose the AI reply with the most evidence for *sentence*.

        The sentence is stripped of punctuation, lowercased and split into
        words; the filler word "any" is dropped and "i"/"i'..." are restored
        to "I"/"I'..." to match the vocabulary. Each unknown word is replaced
        by the closest known word (cosine distance < 0.5), else by the closest
        topic (distance < 0.85), else removed. Words that are still unknown
        afterwards (e.g. a topic word absent from the vocabulary) add no
        evidence.

        Args:
            sentence: raw text typed by the client.

        Returns:
            The reply with the highest summed evidence. When no word carries
            evidence, all scores are zero and the reply with ID 0 is returned.
        """
        print(sentence)
        stripped = ''.join(ch for ch in sentence if ch not in self.exclude).lower()

        # Work on whole tokens so that fixes for "any" and "I" cannot corrupt
        # other words ("many", "hi", ...).
        tokens = []
        for token in stripped.split():
            if token == "any":  # any is a garbage word with no meaning
                continue
            if token == "i" or token.startswith("i'"):
                token = "I" + token[1:]
            tokens.append(token)

        def current_sentence():
            # removed words are stored as None
            return " ".join(t for t in tokens if t)

        normalized = current_sentence().lower()
        if any(normalized == utt.lower() for utt in self.client_utterances):
            print("we have this excact utterance, we COULD use our knowledge, but we do not do this. Because want to test the stability fo the method")

        # first we check for unknown words
        for position, word in enumerate(tokens):
            if word in self.dictionary:
                continue
            try:
                dist, closest = self.find_closest_word(word)
                print("closest word is %s  at  %s" % (closest, dist))
                if dist < 0.5:
                    tokens[position] = closest
                    print("Replaced sentence is: \n%s" % current_sentence())
                else:
                    # no similar word; see whether the word belongs to some category
                    topic, topic_dist = self._closest_topic(word)
                    # we replace with the category only if it is close enough
                    if topic_dist < 0.85:
                        tokens[position] = topic
                        print("Replaced with topic: \n%s" % topic)
                    else:
                        # no way to replace: remove the word, still show what was closest
                        tokens[position] = None
                        print("BAD WORD! Most similar was: \n%s"
                              % (self.model.most_similar(positive=[word], topn=10),))
                        print("removed a word: \n%s" % current_sentence())
            except KeyError:
                # word2vec has no vector for the word (for ex: a typing error)
                print("%s  is not understood by word2vec, removing it from the senetence" % word)
                tokens[position] = None
                print("replaced  %s" % current_sentence())
        print("Cleaned up sentence is: \n%s" % current_sentence())

        # now we are ready to choose the best response:
        # take the matrix columns of the words present in the sentence
        small_matrix = [self.occurence_matrix[:, self.dictionary[t]]
                        for t in tokens if t in self.dictionary]
        if small_matrix:
            evidence = np.sum(small_matrix, axis=0)
        else:
            evidence = np.zeros(self.occurence_matrix.shape[0])
        best = int(np.argmax(evidence))  # best reply is the row with most evidence
        return self.reversed_utterance_map[best]
if __name__ == '__main__':
    # Interactive console session: build the assistant once, then answer each
    # typed line until the user types "exit" or the input stream ends.
    DM = dialogue()
    while True:
        try:
            inp = raw_input("Please say something to the virtual shopping assistant (say exit to exit): ")
        except EOFError:
            # end of input (e.g. Ctrl-D or a closed pipe): leave without a traceback
            break
        if inp == "exit":
            break
        print(DM.respond(inp))