-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add smart information retrieval system for TfidfModel. Fix #1785
#1791
Changes from 3 commits
5e1830b
6cef4b1
e8a3f16
648bf21
a6f1afb
d091138
951c549
40c0558
b35344c
0917e75
bef79cc
d3d431c
0e6f21e
7ee7560
f2251a4
b2def84
5b2d37a
ac4b154
0bacc08
51e0eb9
3039732
99e6a6f
7d63d9c
e5140f8
4afbadd
d2fe235
52ee3c4
48e84f7
6d2f47b
607ba61
d0878a4
b544c9c
c4e3656
98ffde5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,24 +11,27 @@ | |
from gensim import interfaces, matutils, utils | ||
from six import iteritems | ||
|
||
import numpy as np | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): | ||
""" | ||
Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: | ||
def resolve_weights(smartirs): | ||
if not isinstance(smartirs, str) or len(smartirs) != 3: | ||
raise ValueError('Expected a string of length 3 except got ' + smartirs) | ||
|
||
idf = add + log(totaldocs / doc_freq) | ||
""" | ||
return add + math.log(1.0 * totaldocs / docfreq, log_base) | ||
w_tf, w_df, w_n = smartirs | ||
|
||
if w_tf not in 'nlabL': | ||
raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to use |
||
|
||
if w_df not in 'ntp': | ||
raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) | ||
|
||
def precompute_idfs(wglobal, dfs, total_docs): | ||
"""Precompute the inverse document frequency mapping for all terms.""" | ||
# not strictly necessary and could be computed on the fly in TfidfModel__getitem__. | ||
# this method is here just to speed things up a little. | ||
return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} | ||
if w_n not in 'ncb': | ||
raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) | ||
|
||
return w_tf, w_df, w_n | ||
|
||
|
||
class TfidfModel(interfaces.TransformationABC): | ||
|
@@ -49,8 +52,8 @@ class TfidfModel(interfaces.TransformationABC): | |
Model persistency is achieved via its load/save methods. | ||
""" | ||
|
||
def __init__(self, corpus=None, id2word=None, dictionary=None, | ||
wlocal=utils.identity, wglobal=df2idf, normalize=True): | ||
def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", | ||
wlocal=None, wglobal=None, normalize=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to support backward compatibility — why did you change the default values? A good solution: support the default behavior by default, and There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
""" | ||
Compute tf-idf by multiplying a local component (term frequency) with a | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you convert all docstrings in this file to numpy-style, according to my previous comment #1780 (comment)? |
||
global component (inverse document frequency), and normalizing | ||
|
@@ -78,10 +81,41 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, | |
and it will be used to directly construct the inverse document frequency | ||
mapping (then `corpus`, if specified, is ignored). | ||
""" | ||
self.normalize = normalize | ||
self.id2word = id2word | ||
self.wlocal, self.wglobal = wlocal, wglobal | ||
self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize | ||
self.num_docs, self.num_nnz, self.idfs = None, None, None | ||
n_tf, n_df, n_n = smartirs | ||
self.smartirs = smartirs | ||
|
||
if self.wlocal is None: | ||
if n_tf == "n": | ||
self.wlocal = lambda tf, mean=None, _max=None: tf | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to use a simple function definition (instead of a lambda) to avoid pickle problems (here and everywhere). |
||
elif n_tf == "l": | ||
self.wlocal = lambda tf, mean=None, _max=None: 1 + math.log(tf) | ||
elif n_tf == "a": | ||
self.wlocal = lambda tf, mean=None, _max=None: 0.5 + (0.5 * tf / _max) | ||
elif n_tf == "b": | ||
self.wlocal = lambda tf, mean=None, _max=None: 1 if tf > 0 else 0 | ||
elif n_tf == "L": | ||
self.wlocal = lambda tf, mean=None, _max=None: (1 + math.log(tf)) / (1 + math.log(mean)) | ||
|
||
if self.wglobal is None: | ||
if n_df == "n": | ||
self.wglobal = utils.identity | ||
elif n_df == "t": | ||
self.wglobal = lambda docfreq, totaldocs: math.log(1.0 * totaldocs / docfreq, 10) | ||
elif n_tf == "p": | ||
self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) | ||
|
||
if self.normalize is None or isinstance(self.normalize, bool): | ||
if n_n == "n" or self.normalize is False: | ||
self.normalize = lambda x: x | ||
elif n_n == "c" or self.normalize is True: | ||
self.normalize = matutils.unitvec | ||
# TODO write byte-size normalisation | ||
# elif n_n == "b": | ||
# self.normalize = matutils.unitvec | ||
|
||
if dictionary is not None: | ||
# user supplied a Dictionary object, which already contains all the | ||
# statistics we need to construct the IDF mapping. we can skip the | ||
|
@@ -92,7 +126,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, | |
) | ||
self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz | ||
self.dfs = dictionary.dfs.copy() | ||
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) | ||
|
||
if id2word is None: | ||
self.id2word = dictionary | ||
elif corpus is not None: | ||
|
@@ -113,6 +147,7 @@ def initialize(self, corpus): | |
logger.info("collecting document frequencies") | ||
dfs = {} | ||
numnnz, docno = 0, -1 | ||
|
||
for docno, bow in enumerate(corpus): | ||
if docno % 10000 == 0: | ||
logger.info("PROGRESS: processing document #%i", docno) | ||
|
@@ -127,11 +162,6 @@ def initialize(self, corpus): | |
|
||
# and finally compute the idf weights | ||
n_features = max(dfs) if dfs else 0 | ||
logger.info( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why did you remove this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This showed the progress of the |
||
"calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", | ||
self.num_docs, n_features, self.num_nnz | ||
) | ||
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) | ||
|
||
def __getitem__(self, bow, eps=1e-12): | ||
""" | ||
|
@@ -144,17 +174,16 @@ def __getitem__(self, bow, eps=1e-12): | |
|
||
# unknown (new) terms will be given zero weight (NOT infinity/huge weight, | ||
# as strict application of the IDF formula would dictate) | ||
|
||
vector = [ | ||
(termid, self.wlocal(tf) * self.idfs.get(termid)) | ||
for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0 | ||
(termid, self.wlocal(tf, mean=np.mean(np.array(bow), axis=1), _max=np.max(bow, axis=1)) * self.wglobal(self.dfs[termid], self.num_docs)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks wasteful (creating arrays, only to throw them away). What are the performance implications of these changes? Do you have a benchmark before/after? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changed the approach. |
||
for termid, tf in bow if self.wglobal(self.dfs[termid], self.num_docs) != 0.0 | ||
] | ||
|
||
# and finally, normalize the vector either to unit length, or use a | ||
# user-defined normalization function | ||
if self.normalize is True: | ||
vector = matutils.unitvec(vector) | ||
elif self.normalize: | ||
vector = self.normalize(vector) | ||
|
||
vector = self.normalize(vector) | ||
|
||
# make sure there are no explicit zeroes in the vector (must be sparse) | ||
vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -498,7 +498,6 @@ def testPersistence(self): | |
original_matrix = self.model.transform(original_bow) | ||
passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) | ||
self.assertTrue(passed) | ||
|
||
def testModelNotFitted(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need to add more tests (for the new functionality). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have that in my checklist, but before that I need to pass the already-present tests. |
||
lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2) | ||
texts_new = ['graph', 'eulerian'] | ||
|
@@ -973,13 +972,13 @@ def testTransform(self): | |
|
||
def testSetGetParams(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't forget to add more tests (also, check situations when you pass |
||
# updating only one param | ||
self.model.set_params(normalize=False) | ||
self.model.set_params(smartirs='nnn') | ||
model_params = self.model.get_params() | ||
self.assertEqual(model_params["normalize"], False) | ||
self.assertEqual(model_params["smartirs"], 'nnn') | ||
|
||
# verify that the attributes values are also changed for `gensim_model` after fitting | ||
self.model.fit(self.corpus) | ||
self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) | ||
self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') | ||
|
||
def testPipeline(self): | ||
with open(datapath('mini_newsgroup'), 'rb') as f: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
docstrings needed too (for all stuff here)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that
Checks for validity of smartirs parameter.
is enough. Do you have anything else in mind as well? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@markroxor need to add "Parameters" (type, description), "Raises" (type, reason), "Returns" (type, description)