forked from lazyprogrammer/machine_learning_examples
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpretrained_glove.py
151 lines (126 loc) · 4.26 KB
/
pretrained_glove.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python
# Author: http://lazyprogrammer.me
# from __future__ import print_function, division
# from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
# WHERE TO GET THE VECTORS:
# GloVe: https://nlp.stanford.edu/projects/glove/
# Direct link: http://nlp.stanford.edu/data/glove.6B.zip
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
def dist1(a, b):
    """Euclidean (L2) distance between vectors a and b."""
    diff = a - b
    return np.linalg.norm(diff)
def dist2(a, b):
    """Cosine distance between vectors a and b: 1 - cos(angle)."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return 1 - a.dot(b) / denom
# pick a distance type (cosine by default; swap in euclidean to compare)
metric = 'cosine'
dist = dist2
# dist, metric = dist1, 'euclidean'
## more intuitive
# def find_analogies(w1, w2, w3):
# for w in (w1, w2, w3):
# if w not in word2vec:
# print("%s not in dictionary" % w)
# return
# king = word2vec[w1]
# man = word2vec[w2]
# woman = word2vec[w3]
# v0 = king - man + woman
# min_dist = float('inf')
# best_word = ''
# for word, v1 in iteritems(word2vec):
# if word not in (w1, w2, w3):
# d = dist(v0, v1)
# if d < min_dist:
# min_dist = d
# best_word = word
# print(w1, "-", w2, "=", best_word, "-", w3)
## faster
def find_analogies(w1, w2, w3):
    """Solve the analogy w1 - w2 = ? - w3 and print the best-matching word.

    Uses vectorized pairwise distances over the whole embedding matrix,
    then picks the nearest candidate that isn't one of the query words.
    Relies on module-level globals: word2vec, embedding, idx2word, D, V, metric.
    """
    missing = [w for w in (w1, w2, w3) if w not in word2vec]
    if missing:
        print("%s not in dictionary" % missing[0])
        return

    # target point in embedding space: w1 - w2 + w3
    target = word2vec[w1] - word2vec[w2] + word2vec[w3]

    dists = pairwise_distances(target.reshape(1, D), embedding, metric=metric).reshape(V)
    # top 4 is enough: at most 3 candidates can be query words themselves
    for idx in dists.argsort()[:4]:
        candidate = idx2word[idx]
        if candidate not in (w1, w2, w3):
            best_word = candidate
            break
    print(w1, "-", w2, "=", best_word, "-", w3)
def nearest_neighbors(w, n=10):
    """Print the n words closest to w (excluding w itself).

    Relies on module-level globals: word2vec, embedding, idx2word, D, V, metric.
    """
    if w not in word2vec:
        print("%s not in dictionary:" % w)
        return

    query = word2vec[w]
    dists = pairwise_distances(query.reshape(1, D), embedding, metric=metric).reshape(V)
    print("neighbors of: %s" % w)
    # index 0 is w itself (distance 0), so skip it
    for idx in dists.argsort()[1:n+1]:
        print("\t%s" % idx2word[idx])
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
embedding = []
idx2word = []
d = str(200)
print('dimensions: ' + d)
with open('../large_files/glove.6B/glove.6B.' + d + 'd.txt', encoding='utf-8') as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        parts = line.split()
        token = parts[0]
        coeffs = np.asarray(parts[1:], dtype='float32')
        word2vec[token] = coeffs
        embedding.append(coeffs)
        idx2word.append(token)
print('Found %s word vectors.' % len(word2vec))
embedding = np.array(embedding)
V, D = embedding.shape
# analogy demos: each triple (a, b, c) asks a - b = ? - c
analogy_queries = [
    ('chest', 'lungs', 'head'),
    ('arm', 'hand', 'leg'),
    ('king', 'man', 'woman'),
    ('france', 'paris', 'london'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
    ('france', 'french', 'english'),
    ('japan', 'japanese', 'chinese'),
    ('japan', 'japanese', 'italian'),
    ('japan', 'japanese', 'australian'),
    ('december', 'november', 'june'),
    ('miami', 'florida', 'texas'),
    ('einstein', 'scientist', 'painter'),
    ('china', 'rice', 'bread'),
    ('man', 'woman', 'she'),
    ('man', 'woman', 'aunt'),
    ('man', 'woman', 'sister'),
    ('man', 'woman', 'wife'),
    ('man', 'woman', 'actress'),
    ('man', 'woman', 'mother'),
    ('heir', 'heiress', 'princess'),
    ('nephew', 'niece', 'aunt'),
    ('france', 'paris', 'tokyo'),
    ('france', 'paris', 'beijing'),
    ('february', 'january', 'november'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
]
for a, b, c in analogy_queries:
    find_analogies(a, b, c)

# nearest-neighbor demos
neighbor_queries = [
    'glioblastoma',
    'egfr',
    'braf',
    'pten',
    'king',
    'france',
    'japan',
    'einstein',
    'woman',
    'nephew',
    'february',
    'rome',
]
for w in neighbor_queries:
    nearest_neighbors(w)