Refactor line2doc methods of LowCorpus and MalletCorpus (#2269)
* Refactor line2doc methods of LowCorpus and MalletCorpus to be more efficient

* fix build

* add tests

* fix build
horpto authored and menshikh-iv committed Jan 11, 2019
1 parent 3a7760a commit 680de8d
Showing 3 changed files with 63 additions and 14 deletions.
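
The gist of the refactor, sketched in isolation below: the old line2doc counted each distinct word with words.count(w), rescanning the whole token list once per word, while the new version tallies all tokens in a single pass with collections.Counter and still preserves first-appearance order. The vocabulary and sample sentence here are toy values for illustration only, not taken from the commit.

    from collections import Counter

    word2id = {'mom': 1, 'window': 2}  # toy vocabulary, not from the commit
    words = 'mom wash window window was washed'.split()

    # Old-style counting: words.count(w) rescans the token list for every
    # distinct word, i.e. O(n * m) for n tokens and m distinct known words.
    old_doc = [(word2id[w], words.count(w)) for w in set(words) if w in word2id]

    # Counter-based counting: one O(n) pass; use_words keeps the known words
    # in the order they first appear, mirroring the refactored method.
    use_words, counts = [], Counter()
    for word in words:
        if word not in word2id:
            continue
        if word not in counts:
            use_words.append(word)
        counts[word] += 1
    new_doc = [(word2id[w], counts[w]) for w in use_words]

    print(sorted(old_doc) == sorted(new_doc))  # True; only the cost differs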
19 changes: 9 additions & 10 deletions gensim/corpora/lowcorpus.py
@@ -10,10 +10,10 @@
 from __future__ import with_statement

 import logging
+from collections import Counter

 from gensim import utils
 from gensim.corpora import IndexedCorpus
-from six import iterkeys
 from six.moves import zip, range


@@ -159,25 +159,24 @@ def line2doc(self, line):
         words = self.line2words(line)

         if self.use_wordids:
-            # get all distinct terms in this document, ignore unknown words
-            uniq_words = set(words).intersection(iterkeys(self.word2id))
-
             # the following creates a unique list of words *in the same order*
             # as they were in the input. when iterating over the documents,
             # the (word, count) pairs will appear in the same order as they
             # were in the input (bar duplicates), which looks better.
             # if this was not needed, we might as well have used useWords = set(words)
-            use_words, marker = [], set()
+            use_words, counts = [], Counter()
             for word in words:
-                if (word in uniq_words) and (word not in marker):
+                if word not in self.word2id:
+                    continue
+                if word not in counts:
                     use_words.append(word)
-                    marker.add(word)
+                counts[word] += 1
             # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
+            doc = [(self.word2id[w], counts[w]) for w in use_words]
         else:
-            uniq_words = set(words)
+            word_freqs = Counter(words)
             # construct a list of (word, wordFrequency) 2-tuples
-            doc = [(w, words.count(w)) for w in uniq_words]
+            doc = list(word_freqs.items())

         # return the document, then forget it and move on to the next one
         # note that this way, only one doc is stored in memory at a time, not the whole corpus
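A usage sketch of the refactored LowCorpus.line2doc (the List-of-Words file written here and the vocabulary are made up for the example; the expected outputs mirror the new tests below):

    import tempfile

    from gensim.corpora.lowcorpus import LowCorpus

    # Minimal List-of-Words file: the first line holds the document count,
    # each following line is one space-separated document.
    with tempfile.NamedTemporaryFile('w', suffix='.low', delete=False) as fh:
        fh.write('1\nmom wash window window\n')
        fname = fh.name

    corpus = LowCorpus(fname, id2word={1: 'mom', 2: 'window'})

    # use_wordids=True (the default): unknown words are skipped and the result
    # is a list of (word_id, frequency) pairs in first-appearance order.
    print(corpus.line2doc('mom wash window window was washed'))
    # [(1, 1), (2, 2)]

    # use_wordids=False: every word is kept, paired with its frequency.
    corpus.use_wordids = False
    print(sorted(corpus.line2doc('mom wash window window was washed')))
    # [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)]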
7 changes: 4 additions & 3 deletions gensim/corpora/malletcorpus.py
@@ -125,10 +125,11 @@ def line2doc(self, line):
             [(3, 1), (4, 1)]
         """
-        splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
-        docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
+        split_line = utils.to_unicode(line).strip().split(None, 2)
+        docid, doclang = split_line[0], split_line[1]
+        words = split_line[2] if len(split_line) >= 3 else ''

-        doc = super(MalletCorpus, self).line2doc(' '.join(words))
+        doc = super(MalletCorpus, self).line2doc(words)

         if self.metadata:
             return doc, (docid, doclang)
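For reference, a standalone restatement of the new parsing (the sample lines are illustrative; the three assignment lines are the ones added in the diff above). A Mallet line has the form '<doc_id> <language> <token> <token> ...', and splitting at most twice keeps the token part as a single string, so it can be passed straight to LowCorpus.line2doc without the old split-and-rejoin round trip.

    line = u'#3 lang mom wash window window was washed'

    # Split on any whitespace, at most twice: docid, language, rest of line.
    split_line = line.strip().split(None, 2)
    docid, doclang = split_line[0], split_line[1]
    words = split_line[2] if len(split_line) >= 3 else ''

    print(docid, doclang)  # #3 lang
    print(words)           # mom wash window window was washed

    # A line with no tokens ('<doc_id> <language>') only yields two fields;
    # the length guard falls back to an empty body instead of indexing [2].
    print('doc1 en'.strip().split(None, 2))  # ['doc1', 'en']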
51 changes: 50 additions & 1 deletion gensim/test/test_corpora.py
@@ -420,11 +420,30 @@ def test_save_format_for_dtm(self):

 class TestLowCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = 'mom wash window window was washed'

     def setUp(self):
         self.corpus_class = lowcorpus.LowCorpus
         self.file_extension = '.low'

+    def test_line2doc(self):
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [(1, 1), (2, 2)])
+

 class TestUciCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
@@ -438,8 +438,9 @@ def test_serialize_compressed(self):
         pass


-class TestMalletCorpus(CorpusTestCase):
+class TestMalletCorpus(TestLowCorpus):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = '#3 lang mom wash window window was washed'

     def setUp(self):
         self.corpus_class = malletcorpus.MalletCorpus
@@ -459,6 +459,35 @@ def test_load_with_metadata(self):
             self.assertEqual(metadata[0], str(i + 1))
             self.assertEqual(metadata[1], 'en')

+    def test_line2doc(self):
+        # case with metadata=False (by default)
+        super(TestMalletCorpus, self).test_line2doc()
+
+        # case with metadata=True
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word, metadata=True)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [(1, 1), (2, 2)])
+

 class TestTextCorpus(CorpusTestCase):

