Refactor line2doc methods of LowCorpus and MalletCorpus (#2269)
* Refactor line2doc methods of LowCorpus and MalletCorpus to be more efficient

* fix build

* add tests

* fix build
horpto authored and menshikh-iv committed Jan 11, 2019
1 parent 3a7760a commit 680de8d
Showing 3 changed files with 63 additions and 14 deletions.
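
The gist of the refactor, sketched in isolation below: the old line2doc counted each distinct word with words.count(w), rescanning the whole token list once per word, while the new version tallies all tokens in a single pass with collections.Counter and still preserves first-appearance order. The vocabulary and sample sentence here are toy values for illustration only, not taken from the commit.

    from collections import Counter

    word2id = {'mom': 1, 'window': 2}  # toy vocabulary, not from the commit
    words = 'mom wash window window was washed'.split()

    # Old-style counting: words.count(w) rescans the token list for every
    # distinct word, i.e. O(n * m) for n tokens and m distinct known words.
    old_doc = [(word2id[w], words.count(w)) for w in set(words) if w in word2id]

    # Counter-based counting: one O(n) pass; use_words keeps the known words
    # in the order they first appear, mirroring the refactored method.
    use_words, counts = [], Counter()
    for word in words:
        if word not in word2id:
            continue
        if word not in counts:
            use_words.append(word)
        counts[word] += 1
    new_doc = [(word2id[w], counts[w]) for w in use_words]

    print(sorted(old_doc) == sorted(new_doc))  # True; only the cost differs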
19 changes: 9 additions & 10 deletions gensim/corpora/lowcorpus.py
@@ -10,10 +10,10 @@
 from __future__ import with_statement

 import logging
+from collections import Counter

 from gensim import utils
 from gensim.corpora import IndexedCorpus
-from six import iterkeys
 from six.moves import zip, range


@@ -159,25 +159,24 @@ def line2doc(self, line):
         words = self.line2words(line)

         if self.use_wordids:
-            # get all distinct terms in this document, ignore unknown words
-            uniq_words = set(words).intersection(iterkeys(self.word2id))
-
             # the following creates a unique list of words *in the same order*
             # as they were in the input. when iterating over the documents,
             # the (word, count) pairs will appear in the same order as they
             # were in the input (bar duplicates), which looks better.
             # if this was not needed, we might as well have used useWords = set(words)
-            use_words, marker = [], set()
+            use_words, counts = [], Counter()
             for word in words:
-                if (word in uniq_words) and (word not in marker):
+                if word not in self.word2id:
+                    continue
+                if word not in counts:
                     use_words.append(word)
-                    marker.add(word)
+                counts[word] += 1
             # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
+            doc = [(self.word2id[w], counts[w]) for w in use_words]
         else:
-            uniq_words = set(words)
+            word_freqs = Counter(words)
             # construct a list of (word, wordFrequency) 2-tuples
-            doc = [(w, words.count(w)) for w in uniq_words]
+            doc = list(word_freqs.items())

         # return the document, then forget it and move on to the next one
         # note that this way, only one doc is stored in memory at a time, not the whole corpus
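A usage sketch of the refactored LowCorpus.line2doc (the List-of-Words file written here and the vocabulary are made up for the example; the expected outputs mirror the new tests below):

    import tempfile

    from gensim.corpora.lowcorpus import LowCorpus

    # Minimal List-of-Words file: the first line holds the document count,
    # each following line is one space-separated document.
    with tempfile.NamedTemporaryFile('w', suffix='.low', delete=False) as fh:
        fh.write('1\nmom wash window window\n')
        fname = fh.name

    corpus = LowCorpus(fname, id2word={1: 'mom', 2: 'window'})

    # use_wordids=True (the default): unknown words are skipped and the result
    # is a list of (word_id, frequency) pairs in first-appearance order.
    print(corpus.line2doc('mom wash window window was washed'))
    # [(1, 1), (2, 2)]

    # use_wordids=False: every word is kept, paired with its frequency.
    corpus.use_wordids = False
    print(sorted(corpus.line2doc('mom wash window window was washed')))
    # [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)]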
7 changes: 4 additions & 3 deletions gensim/corpora/malletcorpus.py
@@ -125,10 +125,11 @@ def line2doc(self, line):
             [(3, 1), (4, 1)]
         """
-        splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
-        docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
+        split_line = utils.to_unicode(line).strip().split(None, 2)
+        docid, doclang = split_line[0], split_line[1]
+        words = split_line[2] if len(split_line) >= 3 else ''

-        doc = super(MalletCorpus, self).line2doc(' '.join(words))
+        doc = super(MalletCorpus, self).line2doc(words)

         if self.metadata:
             return doc, (docid, doclang)
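For reference, a standalone restatement of the new parsing (the sample lines are illustrative; the three assignment lines are the ones added in the diff above). A Mallet line has the form '<doc_id> <language> <token> <token> ...', and splitting at most twice keeps the token part as a single string, so it can be passed straight to LowCorpus.line2doc without the old split-and-rejoin round trip.

    line = u'#3 lang mom wash window window was washed'

    # Split on any whitespace, at most twice: docid, language, rest of line.
    split_line = line.strip().split(None, 2)
    docid, doclang = split_line[0], split_line[1]
    words = split_line[2] if len(split_line) >= 3 else ''

    print(docid, doclang)  # #3 lang
    print(words)           # mom wash window window was washed

    # A line with no tokens ('<doc_id> <language>') only yields two fields;
    # the length guard falls back to an empty body instead of indexing [2].
    print('doc1 en'.strip().split(None, 2))  # ['doc1', 'en']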
51 changes: 50 additions & 1 deletion gensim/test/test_corpora.py
@@ -420,11 +420,30 @@ def test_save_format_for_dtm(self):

 class TestLowCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = 'mom wash window window was washed'

     def setUp(self):
         self.corpus_class = lowcorpus.LowCorpus
         self.file_extension = '.low'

+    def test_line2doc(self):
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [(1, 1), (2, 2)])
+

 class TestUciCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
@@ -438,8 +438,9 @@ def test_serialize_compressed(self):
         pass


-class TestMalletCorpus(CorpusTestCase):
+class TestMalletCorpus(TestLowCorpus):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = '#3 lang mom wash window window was washed'

     def setUp(self):
         self.corpus_class = malletcorpus.MalletCorpus
@@ -459,6 +459,35 @@ def test_load_with_metadata(self):
             self.assertEqual(metadata[0], str(i + 1))
             self.assertEqual(metadata[1], 'en')

+    def test_line2doc(self):
+        # case with metadata=False (by default)
+        super(TestMalletCorpus, self).test_line2doc()
+
+        # case with metadata=True
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word, metadata=True)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [(1, 1), (2, 2)])
+

 class TestTextCorpus(CorpusTestCase):

