[MRG] Optimize Native unsupervised FastText #1742

Merged Dec 7, 2017 (24 commits)

Changes from 9 commits
ceb24ab
adds fasttext extension to setup
manneshiva Nov 26, 2017
fa3aa49
cythonizes training using skipgram with negative sampling
manneshiva Nov 26, 2017
da45be3
loop over indexes using index iterator
manneshiva Nov 26, 2017
a8df8e1
cythonizes training using skipgram with hierarchical softmax
manneshiva Nov 27, 2017
1e51a92
adds cython generated .c file
manneshiva Nov 27, 2017
838e833
resolves segmentation fault with multiple workers
manneshiva Nov 27, 2017
73e9176
fixes accuracy issues due to reference counts of word_subwords becomi…
manneshiva Nov 27, 2017
2d0e1a6
cythonizes fasttext cbow architecture
manneshiva Nov 28, 2017
772267a
cleans extra variables/values
manneshiva Nov 29, 2017
19a5da0
corrects parameters order for word_locks* in cbow
manneshiva Nov 30, 2017
9dafbd5
fixes indentation, unused imports and logging warning for slow version
manneshiva Dec 1, 2017
796ec91
splits long lines and removes redundant `import`/`else`
manneshiva Dec 1, 2017
5a4e627
minor: removes redundant `else`
manneshiva Dec 1, 2017
b471fde
adds docstring
manneshiva Dec 3, 2017
eee20ef
changes docstrings style, splits long lines
manneshiva Dec 4, 2017
b97e68c
Merge branch 'develop' into optimize_fasttext
manneshiva Dec 5, 2017
2dee387
Merge branch 'develop' into optimize_fasttext
manneshiva Dec 5, 2017
1b3e2d3
fix references in fasttext docstring
menshikh-iv Dec 5, 2017
81eebfb
adds deleted else in cbow-neg
manneshiva Dec 5, 2017
2560e1d
fixes docstring format
manneshiva Dec 6, 2017
7b8673a
adds missing docstrings
manneshiva Dec 7, 2017
144fab9
add missing __getitem__ to rst
menshikh-iv Dec 7, 2017
c3f5d74
add missing import to __init__ (`from gensim.models import FastText` …
menshikh-iv Dec 7, 2017
327f4ca
fix docs
menshikh-iv Dec 7, 2017
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -10,3 +10,5 @@ include gensim/models/word2vec_inner.pyx
include gensim/models/word2vec_inner.pxd
include gensim/models/doc2vec_inner.c
include gensim/models/doc2vec_inner.pyx
include gensim/models/fasttext_inner.c
include gensim/models/fasttext_inner.pyx
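Shipping fasttext_inner.c and fasttext_inner.pyx in the source distribution goes hand in hand with the first commit ("adds fasttext extension to setup"), which registers the new Cython module with the build. A minimal sketch of what such a registration looks like; the values below are illustrative, not copied from gensim's setup.py:

    # Illustrative sketch only: gensim's real setup.py assembles its extension list
    # through its own helpers. This shows the shape of the fasttext_inner entry.
    from setuptools import Extension, setup
    import numpy

    setup(
        name='gensim',
        ext_modules=[
            Extension(
                'gensim.models.fasttext_inner',
                sources=['gensim/models/fasttext_inner.c'],   # pre-generated C shipped via MANIFEST.in
                include_dirs=[numpy.get_include()],
            ),
        ],
    )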
179 changes: 84 additions & 95 deletions gensim/models/fasttext.py
@@ -9,77 +9,83 @@
from gensim.models.word2vec import Word2Vec, train_sg_pair, train_cbow_pair
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper, compute_ngrams, ft_hash
from gensim import matutils
Contributor: unused import
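For context on the subword machinery the imports above provide: compute_ngrams breaks a word into the character n-grams that FastText trains on, and ft_hash maps each n-gram into the fixed bucket space. A rough, illustrative sketch (output not shown verbatim):

    from gensim.models.wrappers.fasttext import compute_ngrams, ft_hash

    ngrams = compute_ngrams('night', 3, 6)         # n-grams of '<night>' with lengths 3..6, e.g. '<ni', 'nig', 'ght>'
    bucket_index = ft_hash(ngrams[0]) % 2000000    # 2000000 is the default `bucket` used below
    print(len(ngrams), bucket_index)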


logger = logging.getLogger(__name__)

MAX_WORDS_IN_BATCH = 10000


def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result


def train_batch_sg(model, sentences, alpha, work=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
try:
    from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow
    from gensim.models.fasttext_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

except ImportError:
Contributor:
The problem with this solution is that it makes it impossible to test the native Python implementation. There are a number of ways you could fix this. The first thing that comes to mind is this:

  • add another module fasttext_native or fasttext_legacy and define the native versions of train_batch_sg, train_batch_cbow, FAST_VERSION and MAX_WORDS_IN_BATCH there
  • import as follows:

    try:
        from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow, FAST_VERSION, MAX_WORDS_IN_BATCH
    except ImportError:
        # log warning
        from gensim.models.fasttext_native import train_batch_sg, train_batch_cbow, FAST_VERSION, MAX_WORDS_IN_BATCH

  • add two new params to FastText.__init__() for the train_batch_sg and train_batch_cbow functions. The default values would be the imported functions.

To test the native/legacy implementation, you would then be able to initialize FastText as follows:

    from gensim.models.fasttext import FastText
    from gensim.models.fasttext_native import MAX_WORDS_IN_BATCH, train_batch_sg, train_batch_cbow

    ft = FastText(..., batch_words=MAX_WORDS_IN_BATCH, train_batch_sg=train_batch_sg, train_batch_cbow=train_batch_cbow)
    # test ft

I don't insist on this particular solution; there may be more elegant ways. My main concern is that the native implementation should also be testable and covered by unit tests (just parametrize the existing unit tests to run against both the native and the Cython implementation).

Owner:
Native Python implementations will be removed in the next refactor. We're not going to support them any more, now that @menshikh-iv can build good wheels.

Contributor:
I hope that will be the case @piskvorky; we need to distribute our wheels for several releases in a row first, and if all goes well we can remove the Python parts (but this process is not fast, so I suggest not rushing it).

Owner @piskvorky (Nov 29, 2017):
What I mean is there's no point introducing new constructs for testing both versions, when we know we only want one version.
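A rough sketch of the parametrized test the reviewer describes; note that the fasttext_native module and the train_batch_sg/train_batch_cbow constructor parameters are part of the proposal above, not of this PR:

    import pytest
    from gensim.models.fasttext import FastText
    from gensim.models import fasttext_inner
    from gensim.models import fasttext_native   # hypothetical module from the proposal

    SENTENCES = [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system']]

    @pytest.mark.parametrize('impl', [fasttext_inner, fasttext_native])
    def test_train_batch(impl):
        model = FastText(
            SENTENCES, min_count=1, size=10, iter=1,
            batch_words=impl.MAX_WORDS_IN_BATCH,
            train_batch_sg=impl.train_batch_sg,      # proposed parameter
            train_batch_cbow=impl.train_batch_cbow,  # proposed parameter
        )
        assert model.wv.syn0_vocab.shape == (len(model.wv.vocab), 10)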

    # failed... fall back to plain numpy (20-80x slower training than the above)
Contributor: worth logging a warning?
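    # Hedged sketch of the suggestion above (not part of this diff as of these 9 commits):
    # log a warning when the Cython extension cannot be imported, for example
    #     logger.warning("Slow version of FastText is being used: fasttext_inner could not be imported")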

    FAST_VERSION = -1
    MAX_WORDS_IN_BATCH = 10000

    def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
        result = 0
        for sentence in sentences:
            word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                           model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)
                start = max(0, pos - model.window + reduced_window)
                window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
                word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

                word2_subwords = []
                vocab_subwords_indices = []
                ngrams_subwords_indices = []

                for index in word2_indices:
                    vocab_subwords_indices += [index]
                    word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

                for subword in word2_subwords:
                    ngrams_subwords_indices.append(model.wv.ngrams[subword])

                l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
                l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

                l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
                subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
                if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                    l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

                train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)  # train on the sliding window for target word
            result += len(word_vocabs)
        return result

    def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
        result = 0
        for sentence in sentences:
            word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                           model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)

                subwords_indices = [word.index]
                word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

                for subword in word2_subwords:
                    subwords_indices.append(model.wv.ngrams[subword])

                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    if pos2 != pos:  # don't train on the `word` itself
                        train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

            result += len(word_vocabs)
        return result
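Whichever branch runs, FAST_VERSION records the code path picked up at import time, so users can sanity-check their install. A minimal usage sketch:

    from gensim.models.fasttext import FAST_VERSION

    if FAST_VERSION == -1:
        print('fasttext_inner not compiled; using the slow numpy fallback')
    else:
        print('using the optimized Cython implementation')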


class FastText(Word2Vec):
    def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
                 max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                 negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):
    def __init__(
            self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
            max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
Contributor: The loss parameter is unused.

            negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000,
            trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):

        # fastText specific params
        self.bucket = bucket
@@ -89,12 +95,10 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
        if self.word_ngrams <= 1 and self.max_n == 0:
            self.bucket = 0

        super(FastText, self).__init__(
            sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,
        super(FastText, self).__init__(sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,
Contributor: PEP8: arguments on the first line are forbidden when not using vertical alignment. https://www.python.org/dev/peps/pep-0008/#indentation

            max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
            sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word,
            trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words
        )
            trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words)
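For reference, a minimal end-to-end training call against this constructor (a sketch; the tiny corpus and parameter values are illustrative). The sg/hs/negative flags select the architecture and loss, since, as noted above, the loss argument itself is unused:

    from gensim.models.fasttext import FastText

    sentences = [['machine', 'learning', 'with', 'subwords'], ['hello', 'machine']]
    model = FastText(sentences, size=100, window=5, min_count=1, sg=1, negative=5, iter=5)
    vec = model['machine']      # vector for an in-vocabulary word
    oov = model['machines']     # OOV words get a vector from their character n-grams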

    def initialize_word_vectors(self):
        self.wv = FastTextKeyedVectors()
@@ -104,16 +108,13 @@ def initialize_word_vectors(self):
    def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False):
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus before doing an online update."
                )
                raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
Contributor: incorrect indentation

                                   "First build the vocabulary of your model with a corpus "
                                   "before doing an online update.")
            self.old_vocab_len = len(self.wv.vocab)
            self.old_hash2index_len = len(self.wv.hash2index)

        super(FastText, self).build_vocab(
            sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update
        )
        super(FastText, self).build_vocab(sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update)
        self.init_ngrams(update=update)
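The update branch above enables online vocabulary expansion: build the vocabulary once, train, then extend it with a new corpus. A usage sketch (corpora and parameters are illustrative):

    initial_sentences = [['first', 'corpus', 'sentence'], ['another', 'one']]
    new_sentences = [['fresh', 'words', 'appear', 'here']]

    model = FastText(size=50, min_count=1)
    model.build_vocab(initial_sentences)
    model.train(initial_sentences, total_examples=model.corpus_count, epochs=model.iter)

    model.build_vocab(new_sentences, update=True)    # extends the vocab instead of rebuilding it
    model.train(new_sentences, total_examples=model.corpus_count, epochs=model.iter)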

    def init_ngrams(self, update=False):
@@ -170,18 +171,10 @@ def init_ngrams(self, update=False):

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)
            )
            new_vocab_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))
Contributor: What's the point of reformatting this and the other lines? It makes the lines too long and the code harder to read.

            new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)
            )
            new_ngram_lockf_rows = ones(
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL
            )
            new_ngram_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size))
            new_ngram_lockf_rows = ones((len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
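The scale of those new rows follows the 1/vector_size convention used elsewhere in this file: with size=100 every component is drawn from [-0.01, 0.01]. A tiny standalone illustration:

    import numpy as np

    vector_size = 100
    rng = np.random
    rng.seed(1)
    new_rows = rng.uniform(-1.0 / vector_size, 1.0 / vector_size, (3, vector_size))
    assert np.abs(new_rows).max() <= 1.0 / vector_size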
@@ -192,19 +185,15 @@ def reset_ngram_weights(self):
        rand_obj = np.random
        rand_obj.seed(self.seed)
        for index in range(len(self.wv.vocab)):
            self.wv.syn0_vocab[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            )
            self.wv.syn0_vocab[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size)
        for index in range(len(self.wv.hash2index)):
            self.wv.syn0_ngrams[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            )
            self.wv.syn0_ngrams[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size)

    def _do_train_job(self, sentences, alpha, inits):
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work)
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

@@ -245,4 +234,4 @@ def load_fasttext_format(cls, *args, **kwargs):

    def save(self, *args, **kwargs):
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)
        super(FastText, self).save(*args, **kwargs)
Contributor: PEP8: the file is missing a newline at the end.
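Since the normalized matrices are excluded by default, a saved model is smaller and they are simply rebuilt on demand after loading. A short sketch continuing the usage example further above (file name illustrative):

    model.save('fasttext.model')             # syn0norm / syn0_vocab_norm / syn0_ngrams_norm are skipped
    loaded = FastText.load('fasttext.model')
    print(loaded['machine'][:5])             # word vectors are available again after loading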
