-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] Wrapper for FastText #847
Changes from all commits
55a4fc9
e916f7e
e5416ed
e64766b
c34cf37
c9b31f9
a0329af
0c0e2fa
cdefeb0
1aec5a2
e7368a3
fe283c2
9b36bc4
dfe1893
4a03f20
09b6ebe
7df4138
4c54d9b
a28f9f1
bf1182e
5a6b97b
cfb2e1c
b002765
27c0a14
81f8cbb
aa7e632
c780b9b
ccf5a47
708113b
b7de266
4d3d251
f2d13ce
3777423
6e20834
564ea0d
caeb275
784ffbf
20fe6f2
3b9483b
f5cdfb6
700dd26
d30ea56
bb6e538
c7a5d07
734057b
56d89e9
dc51096
bb48663
b58dd53
461a6b4
9137090
e5ae899
b98b40f
5eb8f75
27bec7b
ef0e1e2
ab07ef9
2f37b04
b2ff794
7b0874a
a7bceb6
dee9f97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,6 +46,25 @@ def save(self, *args, **kwargs): | |
kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) | ||
super(KeyedVectors, self).save(*args, **kwargs) | ||
|
||
def word_vec(self, word, use_norm=False): | ||
""" | ||
Accept a single word as input. | ||
Returns the word's representations in vector space, as a 1D numpy array. | ||
|
||
Example:: | ||
|
||
>>> trained_model.word_vec('office', use_norm=True) | ||
array([ -1.40128313e-02, ...]) | ||
|
||
""" | ||
if word in self.vocab: | ||
if use_norm: | ||
return self.syn0norm[self.vocab[word].index] | ||
else: | ||
return self.syn0[self.vocab[word].index] | ||
else: | ||
raise KeyError("word '%s' not in vocabulary" % word) | ||
|
||
def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): | ||
""" | ||
Find the top-N most similar words. Positive words contribute positively towards the | ||
|
@@ -90,11 +109,10 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, i | |
for word, weight in positive + negative: | ||
if isinstance(word, ndarray): | ||
mean.append(weight * word) | ||
elif word in self.vocab: | ||
mean.append(weight * self.syn0norm[self.vocab[word].index]) | ||
all_words.add(self.vocab[word].index) | ||
else: | ||
raise KeyError("word '%s' not in vocabulary" % word) | ||
mean.append(weight * self.word_vec(word, use_norm=True)) | ||
if word in self.vocab: | ||
all_words.add(self.vocab[word].index) | ||
if not mean: | ||
raise ValueError("cannot compute similarity with no input") | ||
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) | ||
|
@@ -230,22 +248,14 @@ def most_similar_cosmul(self, positive=[], negative=[], topn=10): | |
# allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) | ||
positive = [positive] | ||
|
||
all_words = set() | ||
|
||
def word_vec(word): | ||
if isinstance(word, ndarray): | ||
return word | ||
elif word in self.vocab: | ||
all_words.add(self.vocab[word].index) | ||
return self.syn0norm[self.vocab[word].index] | ||
else: | ||
raise KeyError("word '%s' not in vocabulary" % word) | ||
|
||
positive = [word_vec(word) for word in positive] | ||
negative = [word_vec(word) for word in negative] | ||
positive = [self.word_vec(word, use_norm=True) for word in positive] | ||
negative = [self.word_vec(word, use_norm=True) for word in negative] | ||
if not positive: | ||
raise ValueError("cannot compute similarity with no input") | ||
|
||
all_words = set([self.vocab[word].index for word in positive+negative if word in self.vocab]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is the `all_words` set used for? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To remove the input words from the returned results. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Eh, never mind — the review snippet showed me the code for the square-brackets lookup. |
||
|
||
# equation (4) of Levy & Goldberg "Linguistic Regularities...", | ||
# with distances shifted to [0,1] per footnote (7) | ||
pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive] | ||
|
@@ -311,14 +321,16 @@ def doesnt_match(self, words): | |
""" | ||
self.init_sims() | ||
|
||
words = [word for word in words if word in self.vocab] # filter out OOV words | ||
logger.debug("using words %s" % words) | ||
if not words: | ||
used_words = [word for word in words if word in self] | ||
if len(used_words) != len(words): | ||
ignored_words = set(words) - set(used_words) | ||
logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words) | ||
if not used_words: | ||
raise ValueError("cannot select a word from an empty list") | ||
vectors = vstack(self.syn0norm[self.vocab[word].index] for word in words).astype(REAL) | ||
vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) | ||
mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) | ||
dists = dot(vectors, mean) | ||
return sorted(zip(dists, words))[0][1] | ||
return sorted(zip(dists, used_words))[0][1] | ||
|
||
def __getitem__(self, words): | ||
|
||
|
@@ -345,9 +357,9 @@ def __getitem__(self, words): | |
""" | ||
if isinstance(words, string_types): | ||
# allow calls like trained_model['office'], as a shorthand for trained_model[['office']] | ||
return self.syn0[self.vocab[words].index] | ||
return self.word_vec(words) | ||
|
||
return vstack([self.syn0[self.vocab[word].index] for word in words]) | ||
return vstack([self.word_vec(word) for word in words]) | ||
|
||
def __contains__(self, word): | ||
return word in self.vocab | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dead code test, can never reach here (above line would throw a KeyError).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `KeyError` has been removed.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, it's still there, on line 66.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That line raises a `KeyError` in case `word in self.vocab` is `False`. So in case it's `True`, line 115 would be executed.

Also, `word_vec` has been overridden in the `KeyedVectors` subclass for `FastText`.
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, my point is — isn't it always `True`? How could it be `False`, when that would raise an exception at the line above? The test seems superfluous.

But if subclasses can make `word_vec()` behave differently (not raise for missing words), then it makes sense. Not sure what the general contract for `word_vec()` behaviour is.