-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reduce Phraser
memory usage (drop frequencies)
#2208
Changes from 7 commits
242c80e
bba2e46
9f9b05f
c391fe5
d154e3a
40b6672
40dcbde
21c3911
80e9222
9943909
021226a
9de2495
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -208,6 +208,14 @@ def load(cls, *args, **kwargs): | |
""" | ||
model = super(PhrasesTransformation, cls).load(*args, **kwargs) | ||
# update older models | ||
# if value in phrasegrams dict is a tuple, load only the scores. | ||
try: | ||
for components, scores in model.__dict__['phrasegrams'].items(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this really work? It mutates the collection it's iterating over, which is usually a bad idea. As mentioned previously, I'd make a copy of the original keys (not items) and iterate over that, while mutating the original (large) dict. |
||
if isinstance(scores, tuple): | ||
model.__dict__['phrasegrams'][components] = scores[1] | ||
except KeyError: | ||
pass | ||
|
||
# if no scoring parameter, use default scoring | ||
if not hasattr(model, 'scoring'): | ||
logger.info('older version of %s loaded without scoring function', cls.__name__) | ||
|
@@ -805,7 +813,7 @@ def __init__(self, phrases_model): | |
for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): | ||
if bigram in self.phrasegrams: | ||
logger.info('Phraser repeat %s', bigram) | ||
self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score) | ||
self.phrasegrams[bigram] = score | ||
count += 1 | ||
if not count % 50000: | ||
logger.info('Phraser added %i phrasegrams', count) | ||
|
@@ -848,7 +856,7 @@ def score_item(self, worda, wordb, components, scorer): | |
|
||
""" | ||
try: | ||
return self.phrasegrams[tuple(components)][1] | ||
return self.phrasegrams[tuple(components)] | ||
except KeyError: | ||
return -1 | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,6 @@ | |
import unittest | ||
|
||
import six | ||
|
||
import numpy as np | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keep the blank please. We separate third party modules from internal modules by a blank line (another visual block). The blank line between |
||
from gensim.utils import to_unicode | ||
|
@@ -646,6 +645,16 @@ def testEncoding(self): | |
self.assertTrue(isinstance(transformed, six.text_type)) | ||
|
||
|
||
class TestPhraserModelCompatibilty(unittest.TestCase): | ||
|
||
def testCompatibilty(self): | ||
bigram_loaded = Phraser.load(datapath("phraser_model_3dot6")) | ||
test_sentences = [u'trees', u'graph', u'minors'] | ||
prev_ver = bigram_loaded[test_sentences] | ||
expected_res = ['trees_graph', 'minors'] | ||
self.assertEqual(prev_ver, expected_res) | ||
|
||
|
||
if __name__ == '__main__': | ||
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) | ||
unittest.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why this strange
__dict__
access? Why not use the pattern I showed in my last review?And what is the
try
for, whatKeyError
are we guarding against? Please add code comments.