[MRG] Optimize Native unsupervised FastText #1742
@@ -9,77 +9,83 @@
from gensim.models.word2vec import Word2Vec, train_sg_pair, train_cbow_pair
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper, compute_ngrams, ft_hash
from gensim import matutils

logger = logging.getLogger(__name__)

MAX_WORDS_IN_BATCH = 10000


def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result


def train_batch_sg(model, sentences, alpha, work=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
try:
    from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow
    from gensim.models.fasttext_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

except ImportError:

Review comment: The problem with this solution is that it makes it impossible to test the Python native implementation. There are a number of ways you could fix this. The first thing that comes to mind is this:

    try:
        from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow, FAST_VERSION, MAX_WORDS_IN_BATCH
    except ImportError:
        # log warning
        from gensim.models.fasttext_native import train_batch_sg, train_batch_cbow, FAST_VERSION, MAX_WORDS_IN_BATCH

To test the native/legacy implementation, you would then be able to initialize the model like this:

    from gensim.models.fasttext import FastText
    from gensim.models.fasttext_native import MAX_WORDS_IN_BATCH, train_batch_sg, train_batch_cbow

    ft = FastText(..., batch_words=MAX_WORDS_IN_BATCH, train_batch_sg=train_batch_sg, train_batch_cbow=train_batch_cbow)
    # test ft

I don't insist on this particular solution; there may be more elegant ways. My main concern is that the native implementation should also be testable and covered by unit tests (just parametrize the existing unit tests to run against both the native and the Cython implementation).

Reply: Native Python implementations will be removed in the next refactor. We're not going to support them any more, now that @menshikh-iv can build good wheels.

Reply: I hope that this will be true @piskvorky; we need to distribute our wheels for several releases in a row first, and if all goes well we can remove the Python parts (but this process is not fast, so I suggest not rushing it).

Reply: @janpom this code looks like it repeats the logic from https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/word2vec.py#L137.

Reply: What I mean is that there's no point introducing new constructs for testing both versions when we know we only want one version.
    # failed... fall back to plain numpy (20-80x slower training than the above)

Review comment: worth logging a warning?

    FAST_VERSION = -1
    MAX_WORDS_IN_BATCH = 10000
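
    # (sketch, not part of this PR) One way to address the review comment above would be to
    # emit a warning here so users notice that the optimized Cython routines are unavailable;
    # the exact wording is an assumption, but `logger` is already defined at module level:
    # logger.warning("could not import gensim.models.fasttext_inner, "
    #                "falling back to the slow pure-Python training code")
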
    def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
        result = 0
        for sentence in sentences:
            word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                           model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)
                start = max(0, pos - model.window + reduced_window)
                window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
                word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

                word2_subwords = []
                vocab_subwords_indices = []
                ngrams_subwords_indices = []

                for index in word2_indices:
                    vocab_subwords_indices += [index]
                    word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

                for subword in word2_subwords:
                    ngrams_subwords_indices.append(model.wv.ngrams[subword])

                l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
                l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

                l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
                subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
                if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                    l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

                train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)  # train on the sliding window for target word
            result += len(word_vocabs)
        return result

    def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
        result = 0
        for sentence in sentences:
            word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                           model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)

                subwords_indices = [word.index]
                word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

                for subword in word2_subwords:
                    subwords_indices.append(model.wv.ngrams[subword])

                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    if pos2 != pos:  # don't train on the `word` itself
                        train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

            result += len(word_vocabs)
        return result


class FastText(Word2Vec):
    def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
                 max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                 negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):
    def __init__(
            self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
            max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
            negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000,
            trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):

        # fastText specific params
        self.bucket = bucket

@@ -89,12 +95,10 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
        if self.word_ngrams <= 1 and self.max_n == 0:
            self.bucket = 0

        super(FastText, self).__init__(
            sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,
        super(FastText, self).__init__(sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,

Review comment: PEP8: Arguments on first line forbidden when not using vertical alignment.

            max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
            sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word,
            trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words
        )
            trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words)

    def initialize_word_vectors(self):
        self.wv = FastTextKeyedVectors()

@@ -104,16 +108,13 @@ def initialize_word_vectors(self):
    def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False):
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus before doing an online update."
                )
                raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. "

Review comment: incorrect indentation

                    "First build the vocabulary of your model with a corpus "
                    "before doing an online update.")
            self.old_vocab_len = len(self.wv.vocab)
            self.old_hash2index_len = len(self.wv.hash2index)

        super(FastText, self).build_vocab(
            sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update
        )
        super(FastText, self).build_vocab(sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update)
        self.init_ngrams(update=update)

    def init_ngrams(self, update=False):

@@ -170,18 +171,10 @@ def init_ngrams(self, update=False):

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)
            )
            new_vocab_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))

Review comment: What's the point of reformatting this and the other lines? It makes the lines too long and the code is harder to read.

            new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)
            )
            new_ngram_lockf_rows = ones(
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL
            )
            new_ngram_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size))
            new_ngram_lockf_rows = ones((len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])

@@ -192,19 +185,15 @@ def reset_ngram_weights(self):
        rand_obj = np.random
        rand_obj.seed(self.seed)
        for index in range(len(self.wv.vocab)):
            self.wv.syn0_vocab[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            )
            self.wv.syn0_vocab[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size)
        for index in range(len(self.wv.hash2index)):
            self.wv.syn0_ngrams[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            )
            self.wv.syn0_ngrams[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size)

    def _do_train_job(self, sentences, alpha, inits):
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work)
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

@@ -245,4 +234,4 @@ def load_fasttext_format(cls, *args, **kwargs):

    def save(self, *args, **kwargs):
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)
        super(FastText, self).save(*args, **kwargs)

Review comment: PEP8: No newline at the end of file.

Review comment: unused import.
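
For reference, a minimal usage sketch of the native FastText implementation this PR optimizes. The parameter names come from the __init__ signature in the diff above; the toy corpus, the printed shapes, and the output path are illustrative assumptions, not part of the PR.

    from gensim.models.fasttext import FastText

    sentences = [
        ['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ]

    # skip-gram training with character n-grams of length 3-6, as in the signature above
    model = FastText(sentences, size=50, window=5, min_count=1, iter=5, sg=1, min_n=3, max_n=6)

    print(model.wv['computer'].shape)    # (50,) -- vector for an in-vocabulary word
    print(model.wv['computor'].shape)    # (50,) -- out-of-vocabulary word, built from its character n-grams
    model.save('fasttext_gensim.model')  # hypothetical output path

When the Cython routines from fasttext_inner import successfully, training takes the fast path; otherwise it falls back to the pure-Python train_batch_sg/train_batch_cbow functions shown in the diff.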