diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb index e98047dedc..cc5e85d3a2 100644 --- a/docs/notebooks/sklearn_wrapper.ipynb +++ b/docs/notebooks/sklearn_wrapper.ipynb @@ -65,15 +65,17 @@ "outputs": [], "source": [ "from gensim.corpora import Dictionary\n", - "texts = [['complier', 'system', 'computer'],\n", - " ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],\n", - " ['graph', 'flow', 'network', 'graph'],\n", - " ['loading', 'computer', 'system'],\n", - " ['user', 'server', 'system'],\n", - " ['tree','hamiltonian'],\n", - " ['graph', 'trees'],\n", - " ['computer', 'kernel', 'malfunction','computer'],\n", - " ['server','system','computer']]\n", + "texts = [\n", + " ['complier', 'system', 'computer'],\n", + " ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],\n", + " ['graph', 'flow', 'network', 'graph'],\n", + " ['loading', 'computer', 'system'],\n", + " ['user', 'server', 'system'],\n", + " ['tree', 'hamiltonian'],\n", + " ['graph', 'trees'],\n", + " ['computer', 'kernel', 'malfunction', 'computer'],\n", + " ['server', 'system', 'computer']\n", + "]\n", "dictionary = Dictionary(texts)\n", "corpus = [dictionary.doc2bow(text) for text in texts]" ] @@ -119,7 +121,7 @@ } ], "source": [ - "model=SklearnWrapperLdaModel(num_topics=2,id2word=dictionary,iterations=20, random_state=1)\n", + "model=SklearnWrapperLdaModel(num_topics=2, id2word=dictionary, iterations=20, random_state=1)\n", "model.fit(corpus)\n", "model.print_topics(2)\n", "model.transform(corpus)" @@ -167,9 +169,7 @@ "source": [ "rand = np.random.mtrand.RandomState(1) # set seed for getting same result\n", "cats = ['rec.sport.baseball', 'sci.crypt']\n", - "data = fetch_20newsgroups(subset='train',\n", - " categories=cats,\n", - " shuffle=True)" + "data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)" ] }, { @@ -190,9 +190,9 @@ "vec = CountVectorizer(min_df=10, stop_words='english')\n", "\n", "X = vec.fit_transform(data.data)\n", - "vocab = vec.get_feature_names() #vocab to be converted to id2word \n", + "vocab = vec.get_feature_names() # vocab to be converted to id2word \n", "\n", - "id2word=dict([(i, s) for i, s in enumerate(vocab)])" + "id2word = dict([(i, s) for i, s in enumerate(vocab)])" ] }, { @@ -230,8 +230,8 @@ } ], "source": [ - "obj=SklearnWrapperLdaModel(id2word=id2word,num_topics=5,passes=20)\n", - "lda=obj.fit(X)\n", + "obj = SklearnWrapperLdaModel(id2word=id2word, num_topics=5, passes=20)\n", + "lda = obj.fit(X)\n", "lda.print_topics()" ] }, @@ -264,7 +264,7 @@ }, "outputs": [], "source": [ - "def scorer(estimator, X,y=None):\n", + "def scorer(estimator, X, y=None):\n", " goodcm = CoherenceModel(model=estimator, texts= texts, dictionary=estimator.id2word, coherence='c_v')\n", " return goodcm.get_coherence()" ] @@ -297,8 +297,8 @@ } ], "source": [ - "obj=SklearnWrapperLdaModel(id2word=dictionary,num_topics=5,passes=20)\n", - "parameters = {'num_topics':(2, 3, 5, 10), 'iterations':(1,20,50)}\n", + "obj = SklearnWrapperLdaModel(id2word=dictionary, num_topics=5, passes=20)\n", + "parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}\n", "model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n", "model.fit(corpus)" ] @@ -342,12 +342,14 @@ "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn import linear_model\n", + "\n", + "\n", "def print_features_pipe(clf, vocab, n=10):\n", " ''' Better printing for sorted list '''\n", " coef = clf.named_steps['classifier'].coef_[0]\n", " print coef\n", " print 'Positive 
features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[::-1][:n] if coef[j] > 0]))\n", - " print 'Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0]))\n" + " print 'Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0]))" ] }, { @@ -358,7 +360,7 @@ }, "outputs": [], "source": [ - "id2word=Dictionary(map(lambda x : x.split(),data.data))\n", + "id2word = Dictionary([_.split() for _ in data.data])\n", "corpus = [id2word.doc2bow(i.split()) for i in data.data]" ] }, @@ -391,8 +393,8 @@ } ], "source": [ - "model=SklearnWrapperLdaModel(num_topics=15,id2word=id2word,iterations=50, random_state=37)\n", - "clf=linear_model.LogisticRegression(penalty='l2', C=0.1) #l2 penalty used\n", + "model = SklearnWrapperLdaModel(num_topics=15, id2word=id2word, iterations=50, random_state=37)\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", @@ -452,22 +454,13 @@ } ], "source": [ - "model=SklearnWrapperLsiModel(num_topics=15, id2word=id2word)\n", - "clf=linear_model.LogisticRegression(penalty='l2', C=0.1) #l2 penalty used\n", + "model = SklearnWrapperLsiModel(num_topics=15, id2word=id2word)\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", "print pipe.score(corpus, data.target)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/notebooks/word2vec.ipynb b/docs/notebooks/word2vec.ipynb index 4afe8f4134..61679cea4f 100644 --- a/docs/notebooks/word2vec.ipynb +++ b/docs/notebooks/word2vec.ipynb @@ -785,7 +785,7 @@ } ], "source": [ - "model.evaluate_word_pairs(test_data_dir +'wordsim353.tsv')" + "model.evaluate_word_pairs(test_data_dir + 'wordsim353.tsv')" ] }, { @@ -907,8 +907,7 @@ ], "source": [ "model = gensim.models.Word2Vec.load(temp_path)\n", - "more_sentences = [['Advanced', 'users', 'can', 'load', 'a', 'model', 'and', 'continue', \n", - " 'training', 'it', 'with', 'more', 'sentences']]\n", + "more_sentences = [['Advanced', 'users', 'can', 'load', 'a', 'model', 'and', 'continue', 'training', 'it', 'with', 'more', 'sentences']]\n", "model.build_vocab(more_sentences, update=True)\n", "model.train(more_sentences, total_examples=model.corpus_count, epochs=model.iter)\n", "\n", @@ -1023,7 +1022,7 @@ } ], "source": [ - "print(model.predict_output_word(['emergency','beacon','received']))" + "print(model.predict_output_word(['emergency', 'beacon', 'received']))" ] }, { diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 484684c26d..1ff89a5b31 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -194,9 +194,11 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N # determine which tokens to keep if keep_tokens: keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id] - good_ids = (v for v in itervalues(self.token2id) - if no_below <= self.dfs.get(v, 0) <= no_above_abs - or v in keep_ids) + good_ids = ( + v for v in itervalues(self.token2id) + if no_below <= self.dfs.get(v, 0) <= 
no_above_abs
+                or v in keep_ids
+            )
         else:
             good_ids = (
                 v for v in itervalues(self.token2id)
diff --git a/gensim/matutils.py b/gensim/matutils.py
index fbfa383a34..057e65a52f 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -532,8 +532,17 @@ def jaccard(vec1, vec2):
     return 1 - float(len(intersection)) / float(len(union))
 
 
-def jaccard_set(set1, set2):
-    return 1. - float(len(set1 & set2)) / float(len(set1 | set2))
+def jaccard_distance(set1, set2):
+    """
+    Calculate the Jaccard distance between two sets (1 minus the size of their intersection divided by the size of their union).
+    Return a value in the range [0, 1], where values closer to 0 mean smaller distance and thus higher similarity.
+    """
+
+    union_cardinality = len(set1 | set2)
+    if union_cardinality == 0:  # Both sets are empty
+        return 1.
+
+    return 1. - float(len(set1 & set2)) / float(union_cardinality)
 
 
 def dirichlet_expectation(alpha):
diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
index ee6035a449..6937d928d4 100755
--- a/gensim/models/hdpmodel.py
+++ b/gensim/models/hdpmodel.py
@@ -33,7 +33,9 @@
 
 from __future__ import with_statement
 
-import logging, time
+import logging
+import time
+import warnings
 
 import numpy as np
 from scipy.special import gammaln, psi  # gamma function utils
@@ -614,16 +616,14 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
 
     def print_topic(self, topic_id, topn= None, num_words=None):
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for print_topic() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for print_topic() will be deprecated in a future version. Please use topn instead.")
             topn = num_words
 
         return self.show_topic(topic_id, topn, formatted=True)
 
     def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= None,):
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for show_topic() will be deprecated in a future version. 
Please use topn instead.") topn = num_words lambdak = list(self.data[topic_id, :]) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index d9ab324d51..77f7e68be9 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -39,7 +39,7 @@ from gensim import interfaces, utils, matutils from gensim.matutils import dirichlet_expectation from gensim.models import basemodel -from gensim.matutils import kullback_leibler, hellinger, jaccard_set +from gensim.matutils import kullback_leibler, hellinger, jaccard_distance from itertools import chain from scipy.special import gammaln, psi # gamma function utils @@ -989,9 +989,11 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, >>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2` """ - distances = {"kulback_leibler": kullback_leibler, - "hellinger": hellinger, - "jaccard": jaccard_set} + distances = { + "kulback_leibler": kullback_leibler, + "hellinger": hellinger, + "jaccard": jaccard_distance, + } if distance not in distances: valid_keys = ", ".join("`{}`".format(x) for x in distances.keys()) @@ -1019,7 +1021,7 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, if np.abs(np.max(z)) > 1e-8: z /= np.max(z) - annotation = [[None for _ in range(t1_size)] for _ in range(t2_size)] + annotation = [[None] * t1_size for _ in range(t2_size)] for topic1 in range(t1_size): for topic2 in range(t2_size): @@ -1118,7 +1120,7 @@ def load(cls, fname, *args, **kwargs): kwargs['mmap'] = kwargs.get('mmap', None) result = super(LdaModel, cls).load(fname, *args, **kwargs) - # check if `random_state` attribute has been set after main pickel load + # check if `random_state` attribute has been set after main pickle load # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value if not hasattr(result, 'random_state'): @@ -1134,7 +1136,7 @@ def load(cls, fname, *args, **kwargs): id2word_fname = utils.smart_extension(fname, '.id2word') # check if `id2word_fname` file is present on disk # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file - # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickel load + # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load if (os.path.isfile(id2word_fname)): try: result.id2word = utils.unpickle(id2word_fname) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index aaa15660a1..85aeefe173 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -205,7 +205,6 @@ def score_sentence_sg(model, sentence, work=None): will use the optimized version from word2vec_inner instead. """ - log_prob_sentence = 0.0 if model.negative: raise RuntimeError("scoring is only available for HS=True") @@ -483,7 +482,6 @@ def __init__( logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored." 
) - def initialize_word_vectors(self): self.wv = KeyedVectors() @@ -1208,7 +1206,6 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, i Deprecated. Use self.wv.most_similar() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar` """ - return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer) def wmdistance(self, document1, document2): @@ -1216,7 +1213,6 @@ def wmdistance(self, document1, document2): Deprecated. Use self.wv.wmdistance() instead. Refer to the documentation for `gensim.models.KeyedVectors.wmdistance` """ - return self.wv.wmdistance(document1, document2) def most_similar_cosmul(self, positive=[], negative=[], topn=10): @@ -1224,7 +1220,6 @@ def most_similar_cosmul(self, positive=[], negative=[], topn=10): Deprecated. Use self.wv.most_similar_cosmul() instead. Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` """ - return self.wv.most_similar_cosmul(positive, negative, topn) def similar_by_word(self, word, topn=10, restrict_vocab=None): @@ -1232,7 +1227,6 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None): Deprecated. Use self.wv.similar_by_word() instead. Refer to the documentation for `gensim.models.KeyedVectors.similar_by_word` """ - return self.wv.similar_by_word(word, topn, restrict_vocab) def similar_by_vector(self, vector, topn=10, restrict_vocab=None): @@ -1240,7 +1234,6 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None): Deprecated. Use self.wv.similar_by_vector() instead. Refer to the documentation for `gensim.models.KeyedVectors.similar_by_vector` """ - return self.wv.similar_by_vector(vector, topn, restrict_vocab) def doesnt_match(self, words): @@ -1248,7 +1241,6 @@ def doesnt_match(self, words): Deprecated. Use self.wv.doesnt_match() instead. Refer to the documentation for `gensim.models.KeyedVectors.doesnt_match` """ - return self.wv.doesnt_match(words) def __getitem__(self, words): @@ -1256,7 +1248,6 @@ def __getitem__(self, words): Deprecated. Use self.wv.__getitem__() instead. Refer to the documentation for `gensim.models.KeyedVectors.__getitem__` """ - return self.wv.__getitem__(words) def __contains__(self, word): @@ -1264,7 +1255,6 @@ def __contains__(self, word): Deprecated. Use self.wv.__contains__() instead. Refer to the documentation for `gensim.models.KeyedVectors.__contains__` """ - return self.wv.__contains__(word) def similarity(self, w1, w2): @@ -1272,7 +1262,6 @@ def similarity(self, w1, w2): Deprecated. Use self.wv.similarity() instead. Refer to the documentation for `gensim.models.KeyedVectors.similarity` """ - return self.wv.similarity(w1, w2) def n_similarity(self, ws1, ws2): @@ -1280,7 +1269,6 @@ def n_similarity(self, ws1, ws2): Deprecated. Use self.wv.n_similarity() instead. Refer to the documentation for `gensim.models.KeyedVectors.n_similarity` """ - return self.wv.n_similarity(ws1, ws2) def predict_output_word(self, context_words_list, topn=10): @@ -1347,7 +1335,6 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): Deprecated. Use self.wv.log_evaluate_word_pairs() instead. Refer to the documentation for `gensim.models.KeyedVectors.log_evaluate_word_pairs` """ - return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): @@ -1355,7 +1342,6 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case Deprecated. 
Use self.wv.evaluate_word_pairs() instead.
         Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs`
         """
-
         return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown)
 
     def __str__(self):
diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py
index 54356812c7..94a2e5eb1a 100644
--- a/gensim/models/wrappers/dtmmodel.py
+++ b/gensim/models/wrappers/dtmmodel.py
@@ -22,6 +22,7 @@
 
 import logging
 import random
+import warnings
 import tempfile
 import os
 from subprocess import PIPE
@@ -93,7 +94,7 @@ def __init__(
             lencorpus = sum(1 for _ in corpus)
         if lencorpus == 0:
             raise ValueError("cannot compute DTM over an empty corpus")
-        if model == "fixed" and any([i == 0 for i in [len(text) for text in corpus]]):
+        if model == "fixed" and any(not text for text in corpus):
             raise ValueError("""There is a text without words in the input corpus.
                     This breaks method='fixed' (The DIM model).""")
         if lencorpus != sum(time_slices):
@@ -308,8 +309,7 @@ def show_topic(self, topicid, time, topn=50, num_words=None):
     def print_topic(self, topicid, time, topn=10, num_words=None):
         """Return the given topic, formatted as a string."""
         if num_words is not None:  # deprecated num_words is used
-            logger.warning("The parameter num_words for print_topic(() would be deprecated in the updated version.")
-            logger.warning("Please use topn instead.")
+            warnings.warn("The parameter num_words for print_topic() will be deprecated in a future version. Please use topn instead.")
             topn = num_words
 
         return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])
diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py
index d97a51e501..640cf11dd8 100644
--- a/gensim/models/wrappers/ldamallet.py
+++ b/gensim/models/wrappers/ldamallet.py
@@ -43,7 +43,7 @@
 from smart_open import smart_open
 
 from gensim import utils, matutils
-from gensim.utils import check_output
+from gensim.utils import check_output, revdict
 from gensim.models.ldamodel import LdaModel
 from gensim.models import basemodel
 
@@ -190,7 +190,7 @@ def load_word_topics(self):
         if hasattr(self.id2word, 'token2id'):
             word2id = self.id2word.token2id
         else:
-            word2id = dict((v, k) for k, v in iteritems(dict(self.id2word)))
+            word2id = revdict(self.id2word)
 
         with utils.smart_open(self.fstate()) as fin:
             _ = next(fin)  # header
diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py
index 7620093e37..efeb020199 100644
--- a/gensim/models/wrappers/wordrank.py
+++ b/gensim/models/wrappers/wordrank.py
@@ -52,7 +52,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         `wr_path` is the path to the Wordrank directory.
         `corpus_file` is the filename of the text file to be used for training the Wordrank model. Expects file to contain space-separated tokens in a single line
-        `out_name` is name of the directory which will be created(in wordrank folder) to save embeddings and training data.
+        `out_name` is the name of the directory that will be created (in the wordrank folder) to save embeddings and training data.
         `size` is the dimensionality of the feature vectors.
         `window` is the number of context words to the left (and to the right, if symmetric = 1).
         `symmetric` if 0, only use left context words, else use left and right both. 
@@ -98,13 +98,13 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file] output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file] - logger.info("Prepare training data using glove code") + logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames)) for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames): with smart_open(input_fname, 'rb') as r: with smart_open(output_fname, 'wb') as w: utils.check_output(w, args=command, stdin=r) - logger.info("Delete frequencies from vocab file") + logger.info("Deleting frequencies from vocab file") with smart_open(vocab_file, 'wb') as w: utils.check_output(w, args=cmd_del_vocab_freq) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 003d313e6d..3eae9d0265 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -16,8 +16,6 @@ from sklearn.base import TransformerMixin, BaseEstimator - - class SklearnWrapperLdaModel(models.LdaModel, TransformerMixin, BaseEstimator): """ Base LDA module @@ -68,7 +66,6 @@ def get_params(self, deep=True): "gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability, "random_state": self.random_state} - def set_params(self, **parameters): """ Set all parameters. @@ -81,7 +78,7 @@ def fit(self, X, y=None): """ For fitting corpus into the class object. Calls gensim.model.LdaModel: - >>>gensim.models.LdaModel(corpus=corpus,num_topics=num_topics,id2word=id2word,passes=passes,update_every=update_every,alpha=alpha,iterations=iterations,eta=eta,random_state=random_state) + >>> gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=passes, update_every=update_every, alpha=alpha, iterations=iterations, eta=eta, random_state=random_state) """ if sparse.issparse(X): self.corpus = matutils.Sparse2Corpus(X) @@ -106,16 +103,15 @@ def transform(self, docs, minimum_probability=None): # The input as array of array check = lambda x: [x] if isinstance(x[0], tuple) else x docs = check(docs) - X = [[] for i in range(0,len(docs))]; - for k,v in enumerate(docs): + X = [[] for _ in range(0, len(docs))] + for k, v in enumerate(docs): doc_topics = self.get_document_topics(v, minimum_probability=minimum_probability) probs_docs = list(map(lambda x: x[1], doc_topics)) # Everything should be equal in length if len(probs_docs) != self.num_topics: probs_docs.extend([1e-12]*(self.num_topics - len(probs_docs))) X[k] = probs_docs - probs_docs = [] return np.reshape(np.array(X), (len(docs), self.num_topics)) def get_topic_dist(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False): @@ -134,4 +130,4 @@ def partial_fit(self, X): if sparse.issparse(X): X = matutils.Sparse2Corpus(X) - self.update(corpus=X) \ No newline at end of file + self.update(corpus=X) diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py index 6eb5db2776..cb6c514612 100644 --- a/gensim/test/test_sklearn_integration.py +++ b/gensim/test/test_sklearn_integration.py @@ -11,7 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.datasets import load_files from sklearn import linear_model -except: +except ImportError: raise unittest.SkipTest("Test requires scikit-learn to be 
installed, which is not available") from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel @@ -19,18 +19,20 @@ from gensim.corpora import Dictionary from gensim import matutils -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) -texts = [['complier', 'system', 'computer'], - ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'], - ['graph', 'flow', 'network', 'graph'], - ['loading', 'computer', 'system'], - ['user', 'server', 'system'], - ['tree', 'hamiltonian'], - ['graph', 'trees'], - ['computer', 'kernel', 'malfunction', 'computer'], - ['server', 'system', 'computer']] +texts = [ + ['complier', 'system', 'computer'], + ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'], + ['graph', 'flow', 'network', 'graph'], + ['loading', 'computer', 'system'], + ['user', 'server', 'system'], + ['tree', 'hamiltonian'], + ['graph', 'trees'], + ['computer', 'kernel', 'malfunction', 'computer'], + ['server', 'system', 'computer'], +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -48,49 +50,49 @@ def testPrintTopic(self): self.assertTrue(isinstance(k, int)) def testTransform(self): - texts_new = ['graph','eulerian'] + texts_new = ['graph', 'eulerian'] bow = self.model.id2word.doc2bow(texts_new) - X = self.model.transform(bow) - self.assertTrue(X.shape[0], 1) - self.assertTrue(X.shape[1], self.model.num_topics) - texts_new = [['graph','eulerian'],['server', 'flow'], ['path', 'system']] + matrix = self.model.transform(bow) + self.assertTrue(matrix.shape[0], 1) + self.assertTrue(matrix.shape[1], self.model.num_topics) + texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']] bow = [] for i in texts_new: bow.append(self.model.id2word.doc2bow(i)) - X = self.model.transform(bow) - self.assertTrue(X.shape[0], 3) - self.assertTrue(X.shape[1], self.model.num_topics) + matrix = self.model.transform(bow) + self.assertTrue(matrix.shape[0], 3) + self.assertTrue(matrix.shape[1], self.model.num_topics) def testGetTopicDist(self): - texts_new = ['graph','eulerian'] + texts_new = ['graph', 'eulerian'] bow = self.model.id2word.doc2bow(texts_new) - doc_topics, word_topics, phi_values = self.model.get_topic_dist(bow,per_word_topics=True) + doc_topics, word_topics, phi_values = self.model.get_topic_dist(bow, per_word_topics=True) - for k,v in word_topics: + for k, v in word_topics: self.assertTrue(isinstance(v, list)) self.assertTrue(isinstance(k, int)) - for k,v in doc_topics: + for k, v in doc_topics: self.assertTrue(isinstance(v, float)) self.assertTrue(isinstance(k, int)) - for k,v in phi_values: + for k, v in phi_values: self.assertTrue(isinstance(v, list)) self.assertTrue(isinstance(k, int)) def testPartialFit(self): for i in range(10): self.model.partial_fit(X=corpus) # fit against the model again - doc=list(corpus)[0] # transform only the first document + doc = list(corpus)[0] # transform only the first document transformed = self.model[doc] transformed_approx = matutils.sparse2full(transformed, 2) # better approximation - expected=[0.13, 0.87] + expected = [0.13, 0.87] passed = numpy.allclose(sorted(transformed_approx), sorted(expected), atol=1e-1) self.assertTrue(passed) def testCSRMatrixConversion(self): - Arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]]) - sArr = 
sparse.csr_matrix(Arr) + arr = numpy.array([[1, 2, 0], [0, 0, 3], [1, 0, 0]]) + sarr = sparse.csr_matrix(arr) newmodel = SklearnWrapperLdaModel(num_topics=2, passes=100) - newmodel.fit(sArr) + newmodel.fit(sarr) topic = newmodel.print_topics() for k, v in topic: self.assertTrue(isinstance(v, six.string_types)) @@ -98,20 +100,21 @@ def testCSRMatrixConversion(self): def testPipeline(self): model = SklearnWrapperLdaModel(num_topics=2, passes=10, minimum_probability=0, random_state=numpy.random.seed(0)) - with open(datapath('mini_newsgroup'),'rb') as f: + with open(datapath('mini_newsgroup'), 'rb') as f: compressed_content = f.read() uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word=Dictionary(map(lambda x : x.split(), data.data)) + id2word = Dictionary(map(lambda x: x.split(), data.data)) corpus = [id2word.doc2bow(i.split()) for i in data.data] - rand = numpy.random.mtrand.RandomState(1) # set seed for getting same result - clf=linear_model.LogisticRegression(penalty='l2', C=0.1) + numpy.random.mtrand.RandomState(1) # set seed for getting same result + clf = linear_model.LogisticRegression(penalty='l2', C=0.1) text_lda = Pipeline((('features', model,), ('classifier', clf))) text_lda.fit(corpus, data.target) score = text_lda.score(corpus, data.target) self.assertGreater(score, 0.40) + class TestSklearnLSIWrapper(unittest.TestCase): def setUp(self): self.model = SklearnWrapperLsiModel(id2word=dictionary, num_topics=2) @@ -124,39 +127,39 @@ def testModelSanity(self): self.assertTrue(isinstance(k, int)) def testTransform(self): - texts_new = ['graph','eulerian'] + texts_new = ['graph', 'eulerian'] bow = self.model.id2word.doc2bow(texts_new) - X = self.model.transform(bow) - self.assertTrue(X.shape[0], 1) - self.assertTrue(X.shape[1], self.model.num_topics) - texts_new = [['graph','eulerian'],['server', 'flow'], ['path', 'system']] + matrix = self.model.transform(bow) + self.assertTrue(matrix.shape[0], 1) + self.assertTrue(matrix.shape[1], self.model.num_topics) + texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']] bow = [] for i in texts_new: bow.append(self.model.id2word.doc2bow(i)) - X = self.model.transform(bow) - self.assertTrue(X.shape[0], 3) - self.assertTrue(X.shape[1], self.model.num_topics) + matrix = self.model.transform(bow) + self.assertTrue(matrix.shape[0], 3) + self.assertTrue(matrix.shape[1], self.model.num_topics) def testPartialFit(self): for i in range(10): self.model.partial_fit(X=corpus) # fit against the model again - doc=list(corpus)[0] # transform only the first document + doc = list(corpus)[0] # transform only the first document transformed = self.model[doc] transformed_approx = matutils.sparse2full(transformed, 2) # better approximation - expected=[1.39, 0.0] + expected = [1.39, 0.0] passed = numpy.allclose(sorted(transformed_approx), sorted(expected), atol=1e-1) self.assertTrue(passed) def testPipeline(self): model = SklearnWrapperLsiModel(num_topics=2) - with open(datapath('mini_newsgroup'),'rb') as f: + with open(datapath('mini_newsgroup'), 'rb') as f: compressed_content = f.read() uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') cache = pickle.loads(uncompressed_content) data = cache - id2word=Dictionary(map(lambda x : x.split(), data.data)) + id2word = Dictionary(map(lambda x: x.split(), data.data)) corpus = [id2word.doc2bow(i.split()) for i in data.data] - clf=linear_model.LogisticRegression(penalty='l2', C=0.1) + clf = 
linear_model.LogisticRegression(penalty='l2', C=0.1) text_lda = Pipeline((('features', model,), ('classifier', clf))) text_lda.fit(corpus, data.target) score = text_lda.score(corpus, data.target) diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index 2a00f81b01..03a639e454 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -13,15 +13,17 @@ class TestLdaDiff(unittest.TestCase): def setUp(self): - texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + texts = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ] self.dictionary = Dictionary(texts) self.corpus = [self.dictionary.doc2bow(text) for text in texts] self.num_topics = 5 diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 8922c511a3..2d06d58d01 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -9,28 +9,28 @@ """ import logging +from itertools import chain, islice import numpy as np -from gensim.corpora import Dictionary - -from itertools import chain, islice logger = logging.getLogger(__name__) + def _ret_top_ids(segmented_topics): """ Helper function to return a set of all the unique topic ids in segmented topics. """ top_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: - for id in chain.from_iterable(s_i): - if isinstance(id, np.ndarray): - for i in id: + for t_id in chain.from_iterable(s_i): + if isinstance(t_id, np.ndarray): + for i in t_id: top_ids.add(i) else: - top_ids.add(id) + top_ids.add(t_id) return top_ids + def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -49,17 +49,16 @@ def p_boolean_document(corpus, segmented_topics): top_ids = _ret_top_ids(segmented_topics) # Instantiate the dictionary with empty sets for each top_id per_topic_postings = {} - for id in top_ids: - per_topic_postings[id] = set() + for t_id in top_ids: + per_topic_postings[t_id] = set() # Iterate through the documents, appending the document number to the set for each top_id it contains for n, document in enumerate(corpus): doc_words = frozenset(x[0] for x in document) - top_ids_in_doc = top_ids.intersection(doc_words) - if len(top_ids_in_doc) > 0: - for id in top_ids_in_doc: - per_topic_postings[id].add(n) + for word_id in top_ids.intersection(doc_words): + per_topic_postings[word_id].add(n) num_docs = len(corpus) - return (per_topic_postings, num_docs) + return per_topic_postings, num_docs + def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ @@ -84,6 +83,7 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id = 0 # Each window assigned a window id. 
per_topic_postings = {} token2id_dict = dictionary.token2id + def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict): for word in window: word_id = token2id_dict[word] @@ -91,9 +91,10 @@ def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_d if word_id in per_topic_postings: per_topic_postings[word_id].add(window_id) else: - per_topic_postings[word_id] = set([window_id]) + per_topic_postings[word_id] = {window_id} window_id += 1 - return (window_id, per_topic_postings) + return window_id, per_topic_postings + # Apply boolean sliding window to each document in texts. for document in texts: it = iter(document) diff --git a/gensim/utils.py b/gensim/utils.py index 5fa91c5032..5884dc9234 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -943,7 +943,7 @@ def revdict(d): result (which one is kept is arbitrary). """ - return dict((v, k) for (k, v) in iteritems(d)) + return dict((v, k) for (k, v) in iteritems(dict(d))) def toptexts(query, texts, index, n=10): @@ -1164,7 +1164,7 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): Added extra KeyboardInterrupt handling """ try: - logger.debug("COMMAND: %s %s", str(popenargs), str(kwargs)) + logger.debug("COMMAND: %s %s", popenargs, kwargs) process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs) output, unused_err = process.communicate() retcode = process.poll() diff --git a/setup.py b/setup.py index 4b86cf78e2..2c8be1f896 100644 --- a/setup.py +++ b/setup.py @@ -226,11 +226,14 @@ def finalize_options(self): """ -test_env = ['testfixtures', - 'unittest2', - 'Morfessor==2.0.2a4', - 'scikit-learn', - 'pyemd'] +test_env = [ + 'testfixtures', + 'unittest2', + 'Morfessor==2.0.2a4', + 'scikit-learn', + 'pyemd', + 'annoy', +] setup( name='gensim', @@ -287,7 +290,6 @@ def finalize_options(self): 'scipy >= 0.7.0', 'six >= 1.5.0', 'smart_open >= 1.2.1', - 'morfessor==2.0.2alpha4', ], tests_require=test_env, extras_require={