Minor changes in fresh PRs (pep8, misprints, etc) #1369

Merged: 14 commits, May 29, 2017
12 changes: 11 additions & 1 deletion gensim/matutils.py
@@ -533,7 +533,17 @@ def jaccard(vec1, vec2):


 def jaccard_set(set1, set2):
-    return 1. - float(len(set1 & set2)) / float(len(set1 | set2))
+    """
+    A distance metric between set representation.
+    Returns 1 minus the intersection divided by union.
+    Returns a value in range <0, 1> where values closer to 0 mean less distance and thus higher similarity.
+    """
+
+    union_cardinality = len(set1 | set2)
+    if union_cardinality == 0:  # Both sets are empty
+        return 1.
+
+    return 1. - float(len(set1 & set2)) / float(union_cardinality)


 def dirichlet_expectation(alpha):

Owner comment (on the new docstring): PEP257: docstrings in imperative mode ("do X, return Y", not "does X, returns Y").

Owner comment (on the "range <0, 1>" line): less => smaller. I see this is a potential gotcha for our users, because other metrics in gensim are similarity (not distance). How about we put this info directly into the method name -- jaccard_distance instead of jaccard_set?
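For context on the hunk above, here is a standalone sketch of how the patched function behaves. The function body mirrors the diff; in gensim it lives in `gensim.matutils`, but it is reimplemented here so the snippet runs on its own:

```python
def jaccard_set(set1, set2):
    """Return the Jaccard distance between two sets, in the range <0, 1>."""
    union_cardinality = len(set1 | set2)
    if union_cardinality == 0:  # both sets are empty
        return 1.
    return 1. - float(len(set1 & set2)) / float(union_cardinality)

print(jaccard_set({'graph', 'trees'}, {'graph', 'minors'}))  # 0.666..., 1 shared token out of 3
print(jaccard_set(set(), set()))  # 1.0 -- the old one-liner raised ZeroDivisionError here
```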
10 changes: 5 additions & 5 deletions gensim/models/hdpmodel.py
@@ -33,7 +33,9 @@

 from __future__ import with_statement
 
-import logging, time
+import logging
+import time
+import warnings
 import numpy as np
 from scipy.special import gammaln, psi  # gamma function utils
 
@@ -614,16 +616,14 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

     def print_topic(self, topic_id, topn= None, num_words=None):
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for print_topic() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.")
             topn = num_words
 
         return self.show_topic(topic_id, topn, formatted=True)
 
     def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= None,):
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for show_topic() would be deprecated in the updated version. Please use topn instead.")
             topn = num_words
 
         lambdak = list(self.data[topic_id, :])
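Both hdpmodel hunks follow one pattern: collapse the two `logger.warning()` lines into a single `warnings.warn()` (which callers and test frameworks can catch or filter), then fall back to the deprecated argument. A minimal self-contained sketch of that pattern, with a toy return value standing in for the real method body:

```python
import warnings

def print_topic(topic_id, topn=None, num_words=None):
    if num_words is not None:  # deprecated num_words is used
        warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.")
        topn = num_words
    return "topic %d, top %d words" % (topic_id, topn)  # stand-in for the real lookup

print(print_topic(0, num_words=5))  # emits a UserWarning, then behaves as topn=5
```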
14 changes: 8 additions & 6 deletions gensim/models/ldamodel.py
@@ -989,9 +989,11 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
         >>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`
         """
 
-        distances = {"kulback_leibler": kullback_leibler,
-                     "hellinger": hellinger,
-                     "jaccard": jaccard_set}
+        distances = {
+            "kulback_leibler": kullback_leibler,
+            "hellinger": hellinger,
+            "jaccard": jaccard_set,
+        }
 
         if distance not in distances:
             valid_keys = ", ".join("`{}`".format(x) for x in distances.keys())
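The reflowed dict doubles as a dispatch table: `diff()` validates the requested key before looking up the metric. A self-contained sketch of that idiom, with lambda stand-ins for the real distance functions and an error message that is only illustrative:

```python
distances = {
    "kulback_leibler": lambda p, q: 0.0,  # stand-ins for the real metrics
    "hellinger": lambda p, q: 0.0,
    "jaccard": lambda s1, s2: 0.0,
}

def get_metric(distance):
    if distance not in distances:
        valid_keys = ", ".join("`{}`".format(x) for x in distances.keys())
        raise ValueError("Incorrect distance, valid only {}".format(valid_keys))
    return distances[distance]

get_metric("hellinger")  # ok
try:
    get_metric("euclidean")
except ValueError as err:
    print(err)  # lists the valid keys
```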
@@ -1019,7 +1021,7 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
         if np.abs(np.max(z)) > 1e-8:
             z /= np.max(z)
 
-        annotation = [[None for _ in range(t1_size)] for _ in range(t2_size)]
+        annotation = [[None] * t1_size for _ in range(t2_size)]
 
         for topic1 in range(t1_size):
             for topic2 in range(t2_size):
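The annotation change swaps a comprehension for list multiplication on the inner dimension only. That is safe here, and the distinction matters: multiplying the outer list instead would alias a single row object. A quick illustration:

```python
t1_size, t2_size = 3, 2

# The new form: [None] * t1_size builds a fresh row on every loop iteration.
annotation = [[None] * t1_size for _ in range(t2_size)]
annotation[0][0] = 'x'
print(annotation)  # [['x', None, None], [None, None, None]]

# The tempting shortcut [[None] * t1_size] * t2_size repeats the SAME row:
aliased = [[None] * t1_size] * t2_size
aliased[0][0] = 'x'
print(aliased)  # [['x', None, None], ['x', None, None]] -- both rows changed
```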
@@ -1118,7 +1120,7 @@ def load(cls, fname, *args, **kwargs):
         kwargs['mmap'] = kwargs.get('mmap', None)
         result = super(LdaModel, cls).load(fname, *args, **kwargs)
 
-        # check if `random_state` attribute has been set after main pickel load
+        # check if `random_state` attribute has been set after main pickle load
         # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
         # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value
         if not hasattr(result, 'random_state'):
@@ -1134,7 +1136,7 @@ def load(cls, fname, *args, **kwargs):
         id2word_fname = utils.smart_extension(fname, '.id2word')
         # check if `id2word_fname` file is present on disk
         # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file
-        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickel load
+        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load
         if (os.path.isfile(id2word_fname)):
             try:
                 result.id2word = utils.unpickle(id2word_fname)
6 changes: 3 additions & 3 deletions gensim/models/wrappers/dtmmodel.py
@@ -22,6 +22,7 @@

 import logging
 import random
+import warnings
 import tempfile
 import os
 from subprocess import PIPE
@@ -93,7 +94,7 @@ def __init__(
         lencorpus = sum(1 for _ in corpus)
         if lencorpus == 0:
             raise ValueError("cannot compute DTM over an empty corpus")
-        if model == "fixed" and any([i == 0 for i in [len(text) for text in corpus]]):
+        if model == "fixed" and any(not text for text in corpus):
             raise ValueError("""There is a text without words in the input corpus.
                     This breaks method='fixed' (The DIM model).""")
         if lencorpus != sum(time_slices):
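The rewritten guard relies on two facts: an empty list is falsy, and a generator expression lets `any()` stop at the first hit instead of materializing intermediate lists. A small demonstration:

```python
corpus = [['graph', 'trees'], [], ['human', 'interface']]

# Old form: builds two full throwaway lists before any() even starts.
print(any([i == 0 for i in [len(text) for text in corpus]]))  # True

# New form: lazy, short-circuits at the first empty text.
print(any(not text for text in corpus))  # True
```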
@@ -308,8 +309,7 @@ def show_topic(self, topicid, time, topn=50, num_words=None):
     def print_topic(self, topicid, time, topn=10, num_words=None):
         """Return the given topic, formatted as a string."""
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for print_topic(() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for print_topic() would be deprecated in the updated version. Please use topn instead.")
             topn = num_words
 
         return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])
4 changes: 2 additions & 2 deletions gensim/models/wrappers/ldamallet.py
@@ -43,7 +43,7 @@
 from smart_open import smart_open
 
 from gensim import utils, matutils
-from gensim.utils import check_output
+from gensim.utils import check_output, revdict
 from gensim.models.ldamodel import LdaModel
 from gensim.models import basemodel
 
@@ -190,7 +190,7 @@ def load_word_topics(self):
         if hasattr(self.id2word, 'token2id'):
             word2id = self.id2word.token2id
         else:
-            word2id = dict((v, k) for k, v in iteritems(dict(self.id2word)))
+            word2id = revdict(self.id2word)
 
         with utils.smart_open(self.fstate()) as fin:
             _ = next(fin)  # header
6 changes: 3 additions & 3 deletions gensim/models/wrappers/wordrank.py
@@ -52,7 +52,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         `wr_path` is the path to the Wordrank directory.
         `corpus_file` is the filename of the text file to be used for training the Wordrank model.
         Expects file to contain space-separated tokens in a single line
-        `out_name` is name of the directory which will be created(in wordrank folder) to save embeddings and training data.
+        `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data.
         `size` is the dimensionality of the feature vectors.
         `window` is the number of context words to the left (and to the right, if symmetric = 1).
         `symmetric` if 0, only use left context words, else use left and right both.
@@ -98,13 +98,13 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
         output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]
 
-        logger.info("Prepare training data using glove code")
+        logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
         for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
             with smart_open(input_fname, 'rb') as r:
                 with smart_open(output_fname, 'wb') as w:
                     utils.check_output(w, args=command, stdin=r)
 
-        logger.info("Delete frequencies from vocab file")
+        logger.info("Deleting frequencies from vocab file")
         with smart_open(vocab_file, 'wb') as w:
             utils.check_output(w, args=cmd_del_vocab_freq)
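The first wordrank change also shows the lazy %-style logging convention: pass arguments to `logger.info()` instead of pre-formatting the string, so interpolation only happens if the record is actually emitted, and the message now says which files are being prepared. A sketch with made-up filenames and logger name:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("wordrank-demo")  # hypothetical logger name

input_fnames = ['corpus.txt', 'corpus.txt', 'cooccurrence.bin']  # illustrative names

# The join runs eagerly, but the %s interpolation is deferred to emit time;
# if the logger's level filtered this record out, no formatting would occur.
logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
```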
95 changes: 49 additions & 46 deletions gensim/test/test_sklearn_integration.py
@@ -11,26 +11,28 @@
     from sklearn.feature_extraction.text import CountVectorizer
     from sklearn.datasets import load_files
     from sklearn import linear_model
-except:
+except ImportError:
     raise unittest.SkipTest("Test requires scikit-learn to be installed, which is not available")
 
 from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel
 from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel
 from gensim.corpora import Dictionary
 from gensim import matutils
 
-module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
+module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
 datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
 
-texts = [['complier', 'system', 'computer'],
-         ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
-         ['graph', 'flow', 'network', 'graph'],
-         ['loading', 'computer', 'system'],
-         ['user', 'server', 'system'],
-         ['tree', 'hamiltonian'],
-         ['graph', 'trees'],
-         ['computer', 'kernel', 'malfunction', 'computer'],
-         ['server', 'system', 'computer']]
+texts = [
+    ['complier', 'system', 'computer'],
+    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
+    ['graph', 'flow', 'network', 'graph'],
+    ['loading', 'computer', 'system'],
+    ['user', 'server', 'system'],
+    ['tree', 'hamiltonian'],
+    ['graph', 'trees'],
+    ['computer', 'kernel', 'malfunction', 'computer'],
+    ['server', 'system', 'computer'],
+]
 dictionary = Dictionary(texts)
 corpus = [dictionary.doc2bow(text) for text in texts]
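Narrowing the bare `except:` matters because a bare clause catches everything, including `SystemExit`, `KeyboardInterrupt`, and unrelated bugs such as a `NameError` from a typo. A minimal sketch of the corrected guard:

```python
import unittest

try:
    import sklearn  # the import itself is the capability check
except ImportError:
    # Only a genuinely missing dependency triggers the skip; a bare `except:`
    # would also swallow SystemExit, KeyboardInterrupt, or an unrelated bug.
    raise unittest.SkipTest("Test requires scikit-learn to be installed, which is not available")
```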

@@ -48,70 +50,71 @@ def testPrintTopic(self):
         self.assertTrue(isinstance(k, int))
 
     def testTransform(self):
-        texts_new = ['graph','eulerian']
+        texts_new = ['graph', 'eulerian']
         bow = self.model.id2word.doc2bow(texts_new)
-        X = self.model.transform(bow)
-        self.assertTrue(X.shape[0], 1)
-        self.assertTrue(X.shape[1], self.model.num_topics)
-        texts_new = [['graph','eulerian'],['server', 'flow'], ['path', 'system']]
+        matrix = self.model.transform(bow)
+        self.assertTrue(matrix.shape[0], 1)
+        self.assertTrue(matrix.shape[1], self.model.num_topics)
+        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
         bow = []
         for i in texts_new:
             bow.append(self.model.id2word.doc2bow(i))
-        X = self.model.transform(bow)
-        self.assertTrue(X.shape[0], 3)
-        self.assertTrue(X.shape[1], self.model.num_topics)
+        matrix = self.model.transform(bow)
+        self.assertTrue(matrix.shape[0], 3)
+        self.assertTrue(matrix.shape[1], self.model.num_topics)
 
     def testGetTopicDist(self):
-        texts_new = ['graph','eulerian']
+        texts_new = ['graph', 'eulerian']
         bow = self.model.id2word.doc2bow(texts_new)
-        doc_topics, word_topics, phi_values = self.model.get_topic_dist(bow,per_word_topics=True)
+        doc_topics, word_topics, phi_values = self.model.get_topic_dist(bow, per_word_topics=True)
 
-        for k,v in word_topics:
+        for k, v in word_topics:
             self.assertTrue(isinstance(v, list))
             self.assertTrue(isinstance(k, int))
-        for k,v in doc_topics:
+        for k, v in doc_topics:
             self.assertTrue(isinstance(v, float))
             self.assertTrue(isinstance(k, int))
-        for k,v in phi_values:
+        for k, v in phi_values:
             self.assertTrue(isinstance(v, list))
             self.assertTrue(isinstance(k, int))
 
     def testPartialFit(self):
         for i in range(10):
             self.model.partial_fit(X=corpus)  # fit against the model again
-        doc=list(corpus)[0] # transform only the first document
+        doc = list(corpus)[0]  # transform only the first document
         transformed = self.model[doc]
         transformed_approx = matutils.sparse2full(transformed, 2)  # better approximation
-        expected=[0.13, 0.87]
+        expected = [0.13, 0.87]
         passed = numpy.allclose(sorted(transformed_approx), sorted(expected), atol=1e-1)
         self.assertTrue(passed)
 
     def testCSRMatrixConversion(self):
-        Arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]])
-        sArr = sparse.csr_matrix(Arr)
+        arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]])
+        sarr = sparse.csr_matrix(arr)
         newmodel = SklearnWrapperLdaModel(num_topics=2, passes=100)
-        newmodel.fit(sArr)
+        newmodel.fit(sarr)
         topic = newmodel.print_topics()
         for k, v in topic:
             self.assertTrue(isinstance(v, six.string_types))
             self.assertTrue(isinstance(k, int))
 
     def testPipeline(self):
         model = SklearnWrapperLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0))
-        with open(datapath('mini_newsgroup'),'rb') as f:
+        with open(datapath('mini_newsgroup'), 'rb') as f:
             compressed_content = f.read()
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word=Dictionary(map(lambda x : x.split(), data.data))
+        id2word = Dictionary(map(lambda x: x.split(), data.data))
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
-        rand = numpy.random.mtrand.RandomState(1) # set seed for getting same result
-        clf=linear_model.LogisticRegression(penalty='l2', C=0.1)
+        numpy.random.mtrand.RandomState(1)  # set seed for getting same result
+        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
         text_lda = Pipeline((('features', model,), ('classifier', clf)))
         text_lda.fit(corpus, data.target)
         score = text_lda.score(corpus, data.target)
         self.assertGreater(score, 0.40)
 
 
 class TestSklearnLSIWrapper(unittest.TestCase):
     def setUp(self):
         self.model = SklearnWrapperLsiModel(id2word=dictionary, num_topics=2)
@@ -124,39 +127,39 @@ def testModelSanity(self):
         self.assertTrue(isinstance(k, int))
 
     def testTransform(self):
-        texts_new = ['graph','eulerian']
+        texts_new = ['graph', 'eulerian']
         bow = self.model.id2word.doc2bow(texts_new)
-        X = self.model.transform(bow)
-        self.assertTrue(X.shape[0], 1)
-        self.assertTrue(X.shape[1], self.model.num_topics)
-        texts_new = [['graph','eulerian'],['server', 'flow'], ['path', 'system']]
+        matrix = self.model.transform(bow)
+        self.assertTrue(matrix.shape[0], 1)
+        self.assertTrue(matrix.shape[1], self.model.num_topics)
+        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
         bow = []
         for i in texts_new:
             bow.append(self.model.id2word.doc2bow(i))
-        X = self.model.transform(bow)
-        self.assertTrue(X.shape[0], 3)
-        self.assertTrue(X.shape[1], self.model.num_topics)
+        matrix = self.model.transform(bow)
+        self.assertTrue(matrix.shape[0], 3)
+        self.assertTrue(matrix.shape[1], self.model.num_topics)
 
     def testPartialFit(self):
         for i in range(10):
             self.model.partial_fit(X=corpus)  # fit against the model again
-        doc=list(corpus)[0] # transform only the first document
+        doc = list(corpus)[0]  # transform only the first document
         transformed = self.model[doc]
         transformed_approx = matutils.sparse2full(transformed, 2)  # better approximation
-        expected=[1.39, 0.0]
+        expected = [1.39, 0.0]
         passed = numpy.allclose(sorted(transformed_approx), sorted(expected), atol=1e-1)
         self.assertTrue(passed)
 
     def testPipeline(self):
         model = SklearnWrapperLsiModel(num_topics=2)
-        with open(datapath('mini_newsgroup'),'rb') as f:
+        with open(datapath('mini_newsgroup'), 'rb') as f:
             compressed_content = f.read()
         uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
         cache = pickle.loads(uncompressed_content)
         data = cache
-        id2word=Dictionary(map(lambda x : x.split(), data.data))
+        id2word = Dictionary(map(lambda x: x.split(), data.data))
         corpus = [id2word.doc2bow(i.split()) for i in data.data]
-        clf=linear_model.LogisticRegression(penalty='l2', C=0.1)
+        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
         text_lda = Pipeline((('features', model,), ('classifier', clf)))
         text_lda.fit(corpus, data.target)
         score = text_lda.score(corpus, data.target)
20 changes: 11 additions & 9 deletions gensim/test/test_tmdiff.py
@@ -13,15 +13,17 @@

 class TestLdaDiff(unittest.TestCase):
     def setUp(self):
-        texts = [['human', 'interface', 'computer'],
-                 ['survey', 'user', 'computer', 'system', 'response', 'time'],
-                 ['eps', 'user', 'interface', 'system'],
-                 ['system', 'human', 'system', 'eps'],
-                 ['user', 'response', 'time'],
-                 ['trees'],
-                 ['graph', 'trees'],
-                 ['graph', 'minors', 'trees'],
-                 ['graph', 'minors', 'survey']]
+        texts = [
+            ['human', 'interface', 'computer'],
+            ['survey', 'user', 'computer', 'system', 'response', 'time'],
+            ['eps', 'user', 'interface', 'system'],
+            ['system', 'human', 'system', 'eps'],
+            ['user', 'response', 'time'],
+            ['trees'],
+            ['graph', 'trees'],
+            ['graph', 'minors', 'trees'],
+            ['graph', 'minors', 'survey'],
+        ]
         self.dictionary = Dictionary(texts)
         self.corpus = [self.dictionary.doc2bow(text) for text in texts]
         self.num_topics = 5
4 changes: 2 additions & 2 deletions gensim/utils.py
@@ -943,7 +943,7 @@ def revdict(d):
     result (which one is kept is arbitrary).
 
     """
-    return dict((v, k) for (k, v) in iteritems(d))
+    return dict((v, k) for (k, v) in iteritems(dict(d)))
 
 
 def toptexts(query, texts, index, n=10):
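The extra `dict(d)` wrapping is what lets ldamallet.py above replace its hand-rolled inversion with `revdict(self.id2word)`: the helper now accepts anything the `dict()` constructor understands, not just an actual dict. A runnable sketch, reimplementing the helper rather than importing gensim:

```python
from six import iteritems

def revdict(d):
    """Reverse a mapping, e.g. {1: 2, 3: 4} -> {2: 1, 4: 3}."""
    return dict((v, k) for (k, v) in iteritems(dict(d)))

print(revdict({0: 'computer', 1: 'graph'}))      # {'computer': 0, 'graph': 1}
print(revdict([(0, 'computer'), (1, 'graph')]))  # same result from key/value pairs
```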
@@ -1164,7 +1164,7 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
     Added extra KeyboardInterrupt handling
     """
     try:
-        logger.debug("COMMAND: %s %s", str(popenargs), str(kwargs))
+        logger.debug("COMMAND: %s %s", popenargs, kwargs)
         process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
         output, unused_err = process.communicate()
         retcode = process.poll()