diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index e4d0f0490f..60ba004353 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -145,7 +145,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): self.corpus2mallet(corpus, fout) # convert the text file above into MALLET's internal format - cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s" + cmd = self.mallet_path + ' import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input %s --output %s' if infer: cmd += ' --use-pipe-from ' + self.fcorpusmallet() cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer') @@ -166,7 +166,7 @@ def train(self, corpus): logger.info("training MALLET LDA with %s", cmd) check_output(cmd, shell=True) self.word_topics = self.load_word_topics() - # NOTE - we are still keeping the wordtopics variable to not break backward compatibility. + # NOTE - we are still keeping the wordtopics variable to not break backward compatibility. # word_topics has replaced wordtopics throughout the code; wordtopics just stores the values of word_topics when train is called. self.wordtopics = self.word_topics @@ -260,12 +260,12 @@ def get_version(self, direc_path): Check version of mallet via jar file """ archive = zipfile.ZipFile(direc_path, 'r') - if u'cc/mallet/regression/' not in archive.namelist(): + if u'cc/mallet/regression/' not in archive.namelist(): return '2.0.7' else: return '2.0.8RC3' except Exception: - + xml_path = direc_path.split("bin")[0] try: doc = et.parse(xml_path + "pom.xml").getroot() @@ -273,7 +273,7 @@ def get_version(self, direc_path): return doc.find(namespace + 'version').text.split("-")[0] except Exception: return "Can't parse pom.xml version file" - + def read_doctopics(self, fname, eps=1e-6, renorm=True): @@ -304,7 +304,7 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): if mallet_version == "2.0.7": """ - 1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364 + 1 1 0 1.0780612802674239 30.005575655428533364 2 0.005575655428533364 1 0.005575655428533364 2 2 0 0.9184413079632608 40.009062076892971008 3 0.009062076892971008 2 0.009062076892971008 1 0.009062076892971008 In the above example there is a mix of the above if and elif statement. There are neither `2*num_topics` nor `num_topics` elements. It has 2 formats 40.009062076892971008 and 0 1.0780612802674239 which cannot be handled by above if elif. @@ -316,14 +316,14 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): doc = [] if len(parts) > 0: while count < len(parts): - """ + """ if section is to deal with formats of type 2 0.034 so if count reaches index of 2 and since int(2) == float(2) so if block is executed now there is one extra element afer 2, so count + 1 access should not give an error else section handles formats of type 20.034 now count is there on index of 20.034 since float(20.034) != int(20.034) so else block - is executed + is executed """ if float(parts[count]) == int(parts[count]):