From 680de8d4f35325e7486c07c4e06422929e826b57 Mon Sep 17 00:00:00 2001 From: horpto <__Singleton__@hackerdom.ru> Date: Fri, 11 Jan 2019 11:43:06 +0500 Subject: [PATCH] Refactor `line2doc` methods of `LowCorpus` and `MalletCorpus` (#2269) * Refactor to more optimal line2doc method of LowCorpus and MalletCorpus * fix build * add tests * fix build --- gensim/corpora/lowcorpus.py | 19 ++++++------- gensim/corpora/malletcorpus.py | 7 +++-- gensim/test/test_corpora.py | 51 +++++++++++++++++++++++++++++++++- 3 files changed, 63 insertions(+), 14 deletions(-) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 2944aafd27..c67c34b700 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -10,10 +10,10 @@ from __future__ import with_statement import logging +from collections import Counter from gensim import utils from gensim.corpora import IndexedCorpus -from six import iterkeys from six.moves import zip, range @@ -159,25 +159,24 @@ def line2doc(self, line): words = self.line2words(line) if self.use_wordids: - # get all distinct terms in this document, ignore unknown words - uniq_words = set(words).intersection(iterkeys(self.word2id)) - # the following creates a unique list of words *in the same order* # as they were in the input. when iterating over the documents, # the (word, count) pairs will appear in the same order as they # were in the input (bar duplicates), which looks better. # if this was not needed, we might as well have used useWords = set(words) - use_words, marker = [], set() + use_words, counts = [], Counter() for word in words: - if (word in uniq_words) and (word not in marker): + if word not in self.word2id: + continue + if word not in counts: use_words.append(word) - marker.add(word) + counts[word] += 1 # construct a list of (wordIndex, wordFrequency) 2-tuples - doc = [(self.word2id.get(w), words.count(w)) for w in use_words] + doc = [(self.word2id[w], counts[w]) for w in use_words] else: - uniq_words = set(words) + word_freqs = Counter(words) # construct a list of (word, wordFrequency) 2-tuples - doc = [(w, words.count(w)) for w in uniq_words] + doc = list(word_freqs.items()) # return the document, then forget it and move on to the next one # note that this way, only one doc is stored in memory at a time, not the whole corpus diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index f41a4456e3..c9a82fffbb 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -125,10 +125,11 @@ def line2doc(self, line): [(3, 1), (4, 1)] """ - splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] - docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] + split_line = utils.to_unicode(line).strip().split(None, 2) + docid, doclang = split_line[0], split_line[1] + words = split_line[2] if len(split_line) >= 3 else '' - doc = super(MalletCorpus, self).line2doc(' '.join(words)) + doc = super(MalletCorpus, self).line2doc(words) if self.metadata: return doc, (docid, doclang) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 5efc8f2b3f..0b8c3c97bd 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -420,11 +420,30 @@ def test_save_format_for_dtm(self): class TestLowCorpus(CorpusTestCase): TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []] + CORPUS_LINE = 'mom wash window window was washed' def setUp(self): self.corpus_class = lowcorpus.LowCorpus self.file_extension = '.low' + def test_line2doc(self): + 
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [(1, 1), (2, 2)])
+
 
 class TestUciCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
@@ -438,8 +457,9 @@ def test_serialize_compressed(self):
         pass
 
 
-class TestMalletCorpus(CorpusTestCase):
+class TestMalletCorpus(TestLowCorpus):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = '#3 lang mom wash window window was washed'
 
     def setUp(self):
         self.corpus_class = malletcorpus.MalletCorpus
@@ -459,6 +479,35 @@ def test_load_with_metadata(self):
             self.assertEqual(metadata[0], str(i + 1))
             self.assertEqual(metadata[1], 'en')
 
+    def test_line2doc(self):
+        # case with metadata=False (by default)
+        super(TestMalletCorpus, self).test_line2doc()
+
+        # case with metadata=True
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word, metadata=True)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [(1, 1), (2, 2)])
+
 
 class TestTextCorpus(CorpusTestCase):
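
The heart of this patch is the counting loop in `LowCorpus.line2doc`: instead of pre-computing `set(words).intersection(word2id)` and then calling `words.count(w)` once per distinct word (a full scan of the document for each term), the new code makes a single pass with `collections.Counter`, skipping out-of-vocabulary tokens and remembering first-occurrence order. A minimal standalone sketch of that step follows; the `line2doc_ids` helper name and the plain `str.split` tokenizer are illustrative, not part of gensim:

from collections import Counter

def line2doc_ids(line, word2id):
    """Sketch of the Counter-based counting introduced for LowCorpus.line2doc."""
    words = line.split()
    use_words, counts = [], Counter()
    for word in words:
        if word not in word2id:
            continue                    # ignore unknown words
        if word not in counts:
            use_words.append(word)      # keep first-occurrence order
        counts[word] += 1
    # one (wordIndex, wordFrequency) pair per distinct known word
    return [(word2id[w], counts[w]) for w in use_words]

# mirrors the new LowCorpus test with id2word = {1: 'mom', 2: 'window'}
print(line2doc_ids('mom wash window window was washed', {'mom': 1, 'window': 2}))
# [(1, 1), (2, 2)]

The `MalletCorpus` change is the same idea one level up: `split(None, 2)` peels off `docid` and `doclang` once, so the remaining text is handed to `LowCorpus.line2doc` directly instead of being split into words and re-joined with `' '.join(words)`.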