From 75c25ee69f1de671e8ecf98df61559fa7cdd0f66 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Fri, 10 Aug 2018 07:32:36 +1000 Subject: [PATCH 01/16] added random_seed parameter to ldamallet wrapper --- gensim/models/wrappers/ldamallet.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index fcd3b4aa6a..6f15e9d5c7 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -74,7 +74,7 @@ class LdaMallet(utils.SaveLoad, basemodel.BaseTopicModel): """ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None, - optimize_interval=0, iterations=1000, topic_threshold=0.0): + optimize_interval=0, iterations=1000, topic_threshold=0.0, random_seed=None): """ Parameters @@ -100,6 +100,8 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N Number of training iterations. topic_threshold : float, optional Threshold of the probability above which we consider a topic. + random_seed: int, optional + Random seed to ensure consistent results, default is None """ self.mallet_path = mallet_path @@ -122,6 +124,7 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N self.workers = workers self.optimize_interval = optimize_interval self.iterations = iterations + self.random_seed = random_seed if corpus is not None: self.train(corpus) @@ -268,11 +271,16 @@ def train(self, corpus): cmd = self.mallet_path + ' train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s '\ '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\ '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s' + + if self.random_seed != None: + cmd += ' --random-seed ' + str(self.random_seed) + cmd = cmd % ( self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer(), self.topic_threshold ) + # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory logger.info("training MALLET LDA with %s", cmd) check_output(args=cmd, shell=True) From 1bfaf48713bb04da5e64f9bc63f37e86a036bd53 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Fri, 10 Aug 2018 22:17:39 +1000 Subject: [PATCH 02/16] added load for backwards compatibility --- gensim/models/wrappers/ldamallet.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 6f15e9d5c7..0896507243 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -569,6 +569,16 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): doc = [(id_, float(weight) / total_weight) for id_, weight in doc] yield doc + @classmethod + def load(cls, *args, **kwargs): + """Load a previously saved LdaMallet class. Handles backwards compatibility from + older LdaMallet versions which did not use random_seed parameter. + """ + model = super(LdaMallet, cls).load(*args, **kwargs) + if not hasattr(model, 'random_seed'): + model.random_seed = None + + return model def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50): """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`. From 2ca085c2ed8660f544ea0d8574a082547948b09f Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Sat, 11 Aug 2018 04:53:37 +1000 Subject: [PATCH 03/16] fix random_seed evaluation for PEP8 best practice --- gensim/models/wrappers/ldamallet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 0896507243..9fadf17e94 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -101,7 +101,7 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N topic_threshold : float, optional Threshold of the probability above which we consider a topic. random_seed: int, optional - Random seed to ensure consistent results, default is None + Random seed to ensure consistent results. """ self.mallet_path = mallet_path @@ -272,7 +272,7 @@ def train(self, corpus): '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\ '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s' - if self.random_seed != None: + if self.random_seed is not None: cmd += ' --random-seed ' + str(self.random_seed) cmd = cmd % ( @@ -280,7 +280,6 @@ def train(self, corpus): self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer(), self.topic_threshold ) - # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory logger.info("training MALLET LDA with %s", cmd) check_output(args=cmd, shell=True) From 645863c247af1f8a019acfc6f2d86c5eb38fd743 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Sat, 11 Aug 2018 19:48:14 +1000 Subject: [PATCH 04/16] Various PEP8 compliance fixes --- gensim/models/wrappers/ldamallet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 9fadf17e94..2d00bbd5ef 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -101,7 +101,7 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N topic_threshold : float, optional Threshold of the probability above which we consider a topic. random_seed: int, optional - Random seed to ensure consistent results. + Random seed to ensure consistent results. """ self.mallet_path = mallet_path @@ -579,6 +579,7 @@ def load(cls, *args, **kwargs): return model + def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50): """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to :class:`~gensim.models.ldamodel.LdaModel`. From d8e0f54248f8cb9ff1d64db1b69c2f81f5ea03e5 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Sat, 11 Aug 2018 20:46:05 +1000 Subject: [PATCH 05/16] PEP8: Removed white space from blank lines --- gensim/models/wrappers/ldamallet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 2d00bbd5ef..7cef35bb8f 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -576,7 +576,7 @@ def load(cls, *args, **kwargs): model = super(LdaMallet, cls).load(*args, **kwargs) if not hasattr(model, 'random_seed'): model.random_seed = None - + return model From 53182a0cf94ce1d97a22af90a81df1e86fae0116 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Wed, 26 Sep 2018 21:10:58 +1000 Subject: [PATCH 06/16] Added tests for random_seed parameter in ldamallet wrapper --- .../Mallet_TMP/MTMP/pre_rs_corpus.mallet | Bin 0 -> 7496 bytes .../MTMP/pre_rs_corpus.mallet.infer | Bin 0 -> 6948 bytes .../Mallet_TMP/MTMP/pre_rs_corpus.txt | 1 + .../Mallet_TMP/MTMP/pre_rs_doctopics.txt | 9 ++++ .../MTMP/pre_rs_doctopics.txt.infer | 2 + .../Mallet_TMP/MTMP/pre_rs_inferencer.mallet | Bin 0 -> 1397 bytes .../Mallet_TMP/MTMP/pre_rs_topickeys.txt | 2 + .../test_data/Mallet_TMP/Mallet_pre_rs.mdl | Bin 0 -> 1361 bytes gensim/test/test_ldamallet_wrapper.py | 46 ++++++++++++++++++ 9 files changed, 60 insertions(+) create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_inferencer.mallet create mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_topickeys.txt create mode 100644 gensim/test/test_data/Mallet_TMP/Mallet_pre_rs.mdl diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet new file mode 100644 index 0000000000000000000000000000000000000000..0c504c16fab90664bbcb3c5eddffd3743b51a2c2 GIT binary patch literal 7496 zcmb7}&yOVMS;wpESO3`EWbN2s43@1#LPQ}mU_yxG10=C57LAFO-6WD14sTb#-CdsQ zs@|&V=^i5?bAT)204N6}gc1%YQVvKY@&PVLJ|dBSfGZ#*B&5i><@0^s?wQ_M6Rk(; znd++deSUnO@AKoW`QHCTS=UCtR1_yGS5>}0=`Yv5JGobPy{ij%Qg7#j;28r-I~7Usk8T>$>HeZvEQB=!La!&#TDEi}QfMwyXIjCFlu8lq=$4_M5 z$(P&KT?*3OpZxnv-~Pb;O%e_6MQK;Q>vwBV9&c0qCLk~Te*w7;M*en`S50y5XJ=6= zr2FXEBapu~eFspnG_Bpr%b3YoG+DO|!1SdDxR;JxGxXjH56wb0=3TRCi^JADxMqTO zYcn*$(3fu-dWH9+jfX(j_9pkR*Vl`Fi5;`j0`A&NL+)WT+H(QB;)nN+6g+{%0rG3f z+1t^bMcZuq<^8@{uLa1^*L`bVK7T}yEBx_pq7|3dk>ZYMCDPxUU9qZNm8Y{sar(_S z@4e2ZWaZWZZc%Se`?k68rw;fJj#}vm4oBsvgZ5T z70_$c`flCS9hZ5xX)pYxE-t&?ulgwM%M}mOO{eXNUjt6ow_d@^7Oh(^S-dLirtP+a z=%S788r76zC--r`Ui|Ib0Y+tY6V#BRN5wd#BS_wyg-qX++C;_-L%@((_I<@}eE z+o%e(m#VPwgpHqkc75kxzWB9g<0!cwWks{8``-&*!i`N`LYdwAJ#t{+|Lvc@@z-}= z{NP`A+vgCR@X?PzMOl>W(D3|C>RG;P89!7j2$z-h%ghrK{A~Iwa_a?19CZo*JK?Q6IapOY6gGEvFkjcRU(=-FHoO;b(70Klga2 zynb*uJfQ0ozU!z~H>L}*7b}j+vhFJAXe&S^q4x(ELa(3O;*7!B>~Az3)TQ zeDy(d?(4(1YmgO_%J0+-jt3=c-pBjTwC1&^whcdyY}-T9Mz363Xw%CR8mxQsZjU!%I9(uV& zW-s*p*Z=e{|LTW7_ySvS__oUQZ~u;5jc+k&bPdBq7$~Nm2FK{obbZCZkzO+B_0e!! z_RB^gOD;;L$*f;`4JH$}UbnvB7GJV$v#A&ihErcS1`~GI-myDD2wFCERi1lEowzyJ zZyms~h~Y+S`{l-lmxgp_5?XmzZC!?TJHng?Jt z(08WPgz$0oh!21b0BOMT0YMsE@WGP-YO{Gc@6lprpkaVkqsn$ENC(6u+b~2SBD1g{ zc&5oPt4R&irgd4PXMCWSf-%E{bk=OKCRsFyNw;MuB^96`3G#I1OqdTD4$DTI$Q$f! zxyWkoMHymVDt5zi!G!Bk8GEpSZn{oNo&6@-Gd&tDeW1nm~|rDPI9Z2Oc)p5Xr7` zGTbtD*sUpFb0o)losmtwJ&y?&{3i-RL6HjeAl=7~KSCVY2MvuUuJIK{E8`M+m_(G8!z4#x?S%`# zXITur3BkVgk~=8O9YNE>#OZfVdRok53StyDs-a@HM zT~gn0EZss7;RNDVsMrr@B?jW8-l^?`h88NxWM4o`*l<~{f}B|WKej3@z`cq??stWi zStbu-dbgqN-wr*8Qeu?ING;7ED&^CV8jK_R(g8235_*XXTO~Oes4^gQl#%+5wMn<3 z$*0)Qn#9QpUZDi}^j;N=OubDW1rE_rjbd>p3N0u?TH{m#_-h?NdvF}t#l){-|aHWsO-1fLb-Wa%vOdm5+j`WEMN z3an*FszDQ}BDOLb{SY-8?FUe}f&do8hV>lC3?Z9EVNO5faZp1&^Sq7|-G#2K z;N`c#U4>ZtHxqxM=DYQk84ryYqc6p>u zfqIPrR(awJv}BggJgu2+aD<~JH)mn;PMvWn2Xiwx*AOo?!vbFxVn$M{D#TlMDKqyi zg}b1{Z;3Run!%J1Mkl^l%q*cZ<(VwmmyLW&+Z4!|Ie}?_E%sgpCaO*HZ7amNhX$tX zSs3@s+)-J(E==@{+A99H=A2vz<;rF!@jJ8nj_UF%9)7XEwD$llM(HB#e}K z`yOZgi%no^R*gf+3<5Trf~;n?wnvFgNaLf^&^S~go1=Ith+NTfDBuG2nUImd2qR%` zjY4p0w2X3EHwP;&I7EeX?R0{*%uy1w0f02Xd1q5=I@;6_QULH)GhCz;GR#DJZA73X zsjgZ_HTPkIT#3_upiJ##&b2uG>Nsr@> z;g+dxAZ2V_B+w**jVx72cwqSw*f_o#;Bb!(rP6V`1og4zYC}G36k0@xWv&W#>o3)b zuCEhZQuWG(3se^zP%AbWW3ytJF65A|m)Y92hi9^-M-QhhQko*=FTx zTx?p+Tb4NL=D`B0h??ddW>C$-WMkmHUIYNih1;J7hxMUEi*(_*0v*sW+-JPJ_q7g+ zBTAC~1$k*NbbhtC`d8+YFLYkL@!zTMzyHRc|HbY{mp9(z=|$diKkIECFOZI2%ItQ# Fp8`h~>$v~` literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer new file mode 100644 index 0000000000000000000000000000000000000000..9ed714292b9d24890eaa1597cfb74e4890f0d600 GIT binary patch literal 6948 zcma)>&xu#KFa|FIvC+cW1hh0H|3J1$2m}H~+2!*+w`Zm`PceGX zJ=InBo*&_{*97oau(fP zxWRp~YS(V=har05tSC;y&QsfYdiCs;vuLWdU#?t_jHl1uau@Eja`o!eZt?u7rFX-& z^ACLGi=k}lA^P+a@?N{OhJANRtosXJuGRyRKOQ7+hq5~TGuN+QckL^WqUT%Rond2j zd;bu9=Fvw{lD1{*hv>PZwYqcq24C3lvB7mKKU{hHIQm93!m@DJ9Mr9N-$fsn<0mrj zbz^N0QI+0WFfAi7Le)yx!KgPfw zrEal6Q~c;lxVkNY;eeiCz=#Xqoqio4_U*RQs-OPP@4TN+9{!t&$KT0IKY#yI=f9WS zMO8?9sR|oU*!Ww|wDcCTPTby#GvV zUirwj;m46}2S~c;Q#Te`XL>@zjbX6s$2M*#)(`!Dcn!XSzo{!oZHS&fYB_nNa!Yo>zPXGz8uO6&+m=v&R=Cp(U8)gMU$Rw6@C1= zPgt1L0Ur3_qX$P>d{gW@u^aKi^1#y;nS;=GU;W#^|Mo9_{>N;=;k%@B^rv5ttMMJW zNZ-&XLbI5A$cwhn^g~56NiS*QLp0u%!@5z(l8cf_Asg0SO=srXw(|wI_>t|JZ3XF% z=e}?>9d_5=u{%KsTDEmno_k51xh2@|9Kf-N#-+8xdTWiTq1@?m8}I7fx&$=oe4Do7 z=xHp%B~hqBV@GX11`D7F=05u_EE{_Ek5T+ zZX1|#E@QGzvn`5?hG!KYG>^b)q~A;+3E|`FF&_XM0n&)&BZ4%#;DaY4)Mm?aIiSVN zK*I>FmMXioARQ5tY)gYgL}p<@@IqZbt0@iCrgd4PXL^-h3dRBx(nYhwnq<`=Cf$|2 zlvIF%B$B5u7s7l@<1ZWKMBZR;$3<3quaqI?rD8WOS4=M+m9YmK(oNS%skwfpJ^p}k z){t*vK0BuC`jmqPdd1B{>1QBlsQ?Y;cK(;i;yd`&SNwc zu5e6E9XZIN9KK%YW=<6>w;+*MD=TB~IjMVm`$skdLO#{6hH3(DdB5 ztj={PlQ9HEk)OCqWlu6ORUV2?rEm4R^A(RZlTat466iY zY+D&$sf&F0>o$ApXgCkjGAkqUf}9%9EI zq-Sd&0LfsKs(jLWG=|v2f}&L`A0=dFSq$!16d*+0%18=k(Il^9>>-`Z&n>e(8||pd zVl2i!S$0km1pJ5vvae-woeYg?Z0#;kK3#KQBL#)sOvYax5SR=NDP@^w(%5*%6$QNv z0JQpU3w4*pwsM_qCJ{QNf_`w0;hU836ayE^7QwBVTpb{3mPBOib_dO=GhRvn#DeZi zRWqka-*cv!OJwdU^&IwE=b-p8mh|KJ5NNb!Z zfWOuOv%$;!fm0eAFc~)|owy{W@ z5`0#SleM$RAJRB|+jlshL$FpMsRm7?DzR13=qFL5$zcE`R}jF0*sxv#nIUAeXsjZh z&Ar9}7zZ`*ndeoD(p~7v3SMjq+)bF($S^e^dL@%4q97pGH+9H>R70!T!jNRP%&c2* zG#9W;HHEh5(N&q)YdapPQ=ndBfH6;eftJkbnWr_g4UTZMAOOW`gk@hy?2Rx_9i!qSN!7Beg8Om!wp4z!W)Xqy5#Gbb<& zu$8@+ArsXm`Lz?`+(QFX_9BdXX6~e{T^}ZTMs1P*oh2t1Lbwio zAaX^=p@0jr&xA|_Mi>cmYZQW0qh*xSx;a>J!67P?Yp)ZmRgQ|F3jm}6&U@Qh)6upj zAq4<$tinYKkzpp%YfA(uNxW)=DiWNTsKZ!3KC}_sT52Q!cHrk2LO4`A7-w@%ifq6f z7Ziu3qTXqW7vNj=sygI|RSMqZkc!kDO2krAx#ppk);X=53F87HcqbxINW1HsdC({L4B&Z z+K>+$g%%NFnX7```U^YJ^>uz+}X85)6eo+l;QJ#kSMDWrd?|9xQ-G)HLrggK8G08bjXeMF5aoxWj31TwhVN zk}e!qNCz~G4;3%(eXWDy1WM9>FJ0OToj)k9|C{Q`7dkKB`k#lNz5Cj?{%-%Tm$%;J U=|$dee2K^lq@$NI`~Cj^0C;?#3;+NC literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt new file mode 100644 index 0000000000..c0e2f0b2dd --- /dev/null +++ b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt @@ -0,0 +1 @@ +0 0 computer time graph diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt new file mode 100644 index 0000000000..788b36d215 --- /dev/null +++ b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt @@ -0,0 +1,9 @@ +0 0 0.49056603773584906 0.5094339622641509 +1 1 0.48214285714285715 0.5178571428571429 +2 2 0.48148148148148145 0.5185185185185185 +3 3 0.5185185185185185 0.48148148148148145 +4 4 0.4716981132075472 0.5283018867924528 +5 5 0.5 0.5 +6 6 0.5 0.5 +7 7 0.49056603773584906 0.5094339622641509 +8 8 0.49056603773584906 0.5094339622641509 diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer new file mode 100644 index 0000000000..979895ea3b --- /dev/null +++ b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer @@ -0,0 +1,2 @@ +#doc name topic proportion ... +0 0 0.4863731656184486 0.5136268343815513 diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_inferencer.mallet b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_inferencer.mallet new file mode 100644 index 0000000000000000000000000000000000000000..c1323ca474a619fd576ba2ea64c7e4baf40a5ba0 GIT binary patch literal 1397 zcma)5O^6gn6n-;3Gdr`{)!858M(_uO^$_R=ahHS%uI>())?QbgHHV#xwLP`dwL4wi zR#ngR7!eOf5CS;_lB)#uq9`b+cu)`_$Djux$wfg3L;^w}E+mJ%>e3R%(!J)$68Ejy4;4oI(H zi{rXhr!-4xT5G=sqOVkiVVAfbt<3~zI+i#j zMsHBT3gz+$6-n(nw_SyeI!j`h2_hs^U{tGi9JVrA0@P}(M=M>B*)8a;M%e`ztc#$f zyb87((wYN(Se#%K={7mbq)Du#nEgVKw!@?|{ja@GK5OivD!} z`1gZbetZMm^S=1%YX6hXMU|5kD8+^OdWu8)2>b^q?s zmtSqWi-CI9Hotx`_t7^OKI=vfB!u?ty%Xcj-)~;-nsv^sCXDpV@4}6?BvZQ1@-D{E z*!N)ch^%LRm#6Q4U~>P7d0+MqFlrbt>BF~gO}`g&?eV_;<@Cot|8`O0P%J>vBTagk z#_9lxq-2DbVSx7>vI%E5Z`;>?FEtmK|~%3CIOfKE z5V5W5hR%gQu{Oi{!0$VbQ|W`88G#M|I7z~5<4U-?=3VmE#}k>(1=#dEWFVupCz*<{ z^(JkYsXSB6i)3hImNGSU@BDwoHYM!LusiU(66qtGvG0n`Hf4*X<> z#|GetKGehl)Wrd4h=b4+i_j8F&=$+k5r?2F4nt2I0avU*UmPuL?r2xzr!zc0fEv{8 z?+|U$=|BTobQ*MT)9KRb5$DqBL%k{?Es`M0LXDqM#U(7IAkQ?OfRq36Y5ELL!U;J$fmRG@jYPv;K;eDIOL|%Y7bbw$I~?k^0;Y zepyT`K4oG;DAD*8EZ8tslI4=BJ5P`;QWR6Nfe{3q$l?eGnscQEq)EobNnU@LK^>2yF4u~F0T4DN-#msLKrh6 z)64kn3%oSK%P(F_ zjjQzxe&2dD-YN+h21*Awv|!T0J*JJu+ce?J~ie07=NPODIt^u1wED8 zg+IgJcE`(Y4J-f}?}&@S_UjixP55<$o7Ma+yuiC+y?n^)s`|I-@NLF%Ucb8au3+gs zgLk!#)1B)-aCF{x{J`)d25%yMEPyk_du8wMj}Qz#nAp~oeVDglOKb|-gKa^h|4`7Z OKN2+2zljzrCf>ia6 Date: Sun, 14 Oct 2018 16:15:49 +1100 Subject: [PATCH 07/16] test_random_seed - test all docs in corpus --- gensim/test/test_ldamallet_wrapper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index 615e1a8c91..5dc66a2fbc 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -196,12 +196,12 @@ def test_random_seed(self): num_topics=2, id2word=dictionary, random_seed = SEED) - doc = list(corpus)[0] - self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics)) and \ - self.assertTrue(np.allclose( - sorted(matutils.sparse2full(tm1[doc], 2)), - sorted(matutils.sparse2full(tm2[doc], 2)), - atol=1e-1)) + self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics)) + for doc in corpus: + self.assertTrue(np.allclose( + sorted(matutils.sparse2full(tm1[doc], 2)), + sorted(matutils.sparse2full(tm2[doc], 2)), + atol=1e-1)) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From a5db1dfb53dfca2e5ec226c516ffb8f0a85e6095 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Sun, 14 Oct 2018 22:52:51 +1100 Subject: [PATCH 08/16] Added additional blank line after class declaration --- gensim/test/test_ldamallet_wrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index 5dc66a2fbc..ac189b8777 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -203,6 +203,7 @@ def test_random_seed(self): sorted(matutils.sparse2full(tm2[doc], 2)), atol=1e-1)) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From 3ef309fd5e23ad9444a44b9267f39889f5f18aa4 Mon Sep 17 00:00:00 2001 From: Zohaggie Date: Sun, 14 Oct 2018 23:37:13 +1100 Subject: [PATCH 09/16] PEP8 formatting changes only --- gensim/test/test_ldamallet_wrapper.py | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index ac189b8777..b89eecf8fa 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -158,16 +158,16 @@ def testLargeMmapCompressed(self): def test_load_model(self): if not self.mallet_path: - return + return # to conduct the test these directories and files should exist model_save_path = ".\\test_data\\Mallet_TMP\\" model_save_name = model_save_path + "Mallet_pre_rs.mdl" - # the saved models temporary files need to be in a common directory, + # the saved models temporary files need to be in a common directory, # they are being named according to the model name to minimize conflicts prefix = model_save_path + "MTMP\\pre_rs_" if not (os.path.exists(model_save_name) & os.path.exists(prefix + "corpus.mallet")): - logging.warning("Pre-existing model files not found. Skipping test loading of them.") + logging.warning("Pre-existing model files not found. Skipping test loading of them.") return model = ldamodel.LdaModel.load(model_save_name) @@ -183,24 +183,24 @@ def test_load_model(self): def test_random_seed(self): if not self.mallet_path: - return + return # test that 2 models created with the same random_seed are equal in their topics treatment - SEED = 10 - tm1 = ldamallet.LdaMallet(self.mallet_path, - corpus=corpus, - num_topics=2, - id2word=dictionary, - random_seed = SEED) - tm2 = ldamallet.LdaMallet(self.mallet_path, - corpus=corpus, - num_topics=2, - id2word=dictionary, - random_seed = SEED) + SEED = 10 + tm1 = ldamallet.LdaMallet(self.mallet_path, + corpus=corpus, + num_topics=2, + id2word=dictionary, + random_seed=SEED) + tm2 = ldamallet.LdaMallet(self.mallet_path, + corpus=corpus, + num_topics=2, + id2word=dictionary, + random_seed=SEED) self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics)) for doc in corpus: self.assertTrue(np.allclose( - sorted(matutils.sparse2full(tm1[doc], 2)), - sorted(matutils.sparse2full(tm2[doc], 2)), + sorted(matutils.sparse2full(tm1[doc], 2)), + sorted(matutils.sparse2full(tm2[doc], 2)), atol=1e-1)) From 53136a1e1ee0aba36abc5861eb916ce99da7089f Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Wed, 9 Jan 2019 15:41:41 +0500 Subject: [PATCH 10/16] fix pep8 --- gensim/models/wrappers/ldamallet.py | 4 ++-- gensim/test/test_ldamallet_wrapper.py | 32 +++++++++++++++------------ 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 73f4e7d8bc..8007720f39 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -278,7 +278,7 @@ def train(self, corpus): if self.random_seed is not None: cmd += ' --random-seed ' + str(self.random_seed) - + cmd = cmd % ( self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, @@ -580,7 +580,7 @@ def load(cls, *args, **kwargs): model = super(LdaMallet, cls).load(*args, **kwargs) if not hasattr(model, 'random_seed'): model.random_seed = None - + return model diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index afd8d118c4..0fcf04ad18 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -184,12 +184,12 @@ def test_load_model(self): # the saved models temporary files need to be in a common directory, # they are being named according to the model name to minimize conflicts prefix = model_save_path + "MTMP\\pre_rs_" - + if not (os.path.exists(model_save_name) & os.path.exists(prefix + "corpus.mallet")): logging.warning("Pre-existing model files not found. Skipping test loading of them.") return model = ldamodel.LdaModel.load(model_save_name) - + # Test loaded model works on a new corpus, made of previously unseen documents. other_texts = [['computer', 'time', 'graph'], ['survey', 'response', 'eps'], @@ -197,24 +197,28 @@ def test_load_model(self): other_corpus = [dictionary.doc2bow(text) for text in other_texts] unseen_doc = other_corpus[0] - vector = model[unseen_doc] # get topic probability distribution for a document + vector = model[unseen_doc] # get topic probability distribution for a document self.assertTrue(sum(n for _, n in vector) == 1) - + def test_random_seed(self): if not self.mallet_path: return # test that 2 models created with the same random_seed are equal in their topics treatment SEED = 10 - tm1 = ldamallet.LdaMallet(self.mallet_path, - corpus=corpus, - num_topics=2, - id2word=dictionary, - random_seed=SEED) - tm2 = ldamallet.LdaMallet(self.mallet_path, - corpus=corpus, - num_topics=2, - id2word=dictionary, - random_seed=SEED) + tm1 = ldamallet.LdaMallet( + self.mallet_path, + corpus=corpus, + num_topics=2, + id2word=dictionary, + random_seed=SEED + ) + tm2 = ldamallet.LdaMallet( + self.mallet_path, + corpus=corpus, + num_topics=2, + id2word=dictionary, + random_seed=SEED + ) self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics)) for doc in corpus: self.assertTrue(np.allclose( From 0b3d7aaf2efd184a5671416b68fb363f6826dbe4 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 14:30:52 +0500 Subject: [PATCH 11/16] use 0 as default seed (according to Mallet doc) + pin seed for inference too --- gensim/models/wrappers/ldamallet.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 8007720f39..133b7d2f88 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -78,7 +78,7 @@ class LdaMallet(utils.SaveLoad, basemodel.BaseTopicModel): """ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None, - optimize_interval=0, iterations=1000, topic_threshold=0.0, random_seed=None): + optimize_interval=0, iterations=1000, topic_threshold=0.0, random_seed=0): """ Parameters @@ -105,7 +105,7 @@ def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=N topic_threshold : float, optional Threshold of the probability above which we consider a topic. random_seed: int, optional - Random seed to ensure consistent results. + Random seed to ensure consistent results, if 0 - use system clock. """ self.mallet_path = mallet_path @@ -265,7 +265,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): def train(self, corpus): """Train Mallet LDA. - Parameters + Parameterstrain-topics ---------- corpus : iterable of iterable of (int, int) Corpus in BoW format @@ -274,15 +274,12 @@ def train(self, corpus): self.convert_input(corpus, infer=False) cmd = self.mallet_path + ' train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s '\ '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\ - '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s' - - if self.random_seed is not None: - cmd += ' --random-seed ' + str(self.random_seed) + '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s --random-seed %s' cmd = cmd % ( self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, - self.finferencer(), self.topic_threshold + self.finferencer(), self.topic_threshold, str(self.random_seed) ) # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory logger.info("training MALLET LDA with %s", cmd) @@ -319,10 +316,10 @@ def __getitem__(self, bow, iterations=100): self.convert_input(bow, infer=True) cmd = \ self.mallet_path + ' infer-topics --input %s --inferencer %s ' \ - '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s' + '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s --random-seed %s' cmd = cmd % ( self.fcorpusmallet() + '.infer', self.finferencer(), - self.fdoctopics() + '.infer', iterations, self.topic_threshold + self.fdoctopics() + '.infer', iterations, self.topic_threshold, str(self.random_seed) ) logger.info("inferring topics with MALLET LDA '%s'", cmd) check_output(args=cmd, shell=True) From 7fa506965d2604e08fb1b9e631a4d49687965fb6 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 14:31:37 +0500 Subject: [PATCH 12/16] make seed test strict --- gensim/test/test_ldamallet_wrapper.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index 0fcf04ad18..020da11ea6 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -203,28 +203,36 @@ def test_load_model(self): def test_random_seed(self): if not self.mallet_path: return + # test that 2 models created with the same random_seed are equal in their topics treatment SEED = 10 + NUM_TOPICS = 10 + ITER = 500 + tm1 = ldamallet.LdaMallet( self.mallet_path, corpus=corpus, - num_topics=2, + num_topics=NUM_TOPICS, id2word=dictionary, - random_seed=SEED + random_seed=SEED, + iterations=ITER, ) + tm2 = ldamallet.LdaMallet( self.mallet_path, corpus=corpus, - num_topics=2, + num_topics=NUM_TOPICS, id2word=dictionary, - random_seed=SEED + random_seed=SEED, + iterations=ITER, ) self.assertTrue(np.allclose(tm1.word_topics, tm2.word_topics)) + for doc in corpus: - self.assertTrue(np.allclose( - sorted(matutils.sparse2full(tm1[doc], 2)), - sorted(matutils.sparse2full(tm2[doc], 2)), - atol=1e-1)) + tm1_vector = matutils.sparse2full(tm1[doc], NUM_TOPICS) + tm2_vector = matutils.sparse2full(tm2[doc], NUM_TOPICS) + + self.assertTrue(np.allclose(tm1_vector, tm2_vector)) if __name__ == '__main__': From b001cd31de0e824eca9335c87d028c14a73cf336 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 14:42:59 +0500 Subject: [PATCH 13/16] remove useless test --- gensim/test/test_ldamallet_wrapper.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py index 020da11ea6..2ad1ccb9c9 100644 --- a/gensim/test/test_ldamallet_wrapper.py +++ b/gensim/test/test_ldamallet_wrapper.py @@ -175,31 +175,6 @@ def testLargeMmapCompressed(self): # test loading the large model arrays with mmap self.assertRaises(IOError, ldamodel.LdaModel.load, fname, mmap='r') - def test_load_model(self): - if not self.mallet_path: - return - # to conduct the test these directories and files should exist - model_save_path = ".\\test_data\\Mallet_TMP\\" - model_save_name = model_save_path + "Mallet_pre_rs.mdl" - # the saved models temporary files need to be in a common directory, - # they are being named according to the model name to minimize conflicts - prefix = model_save_path + "MTMP\\pre_rs_" - - if not (os.path.exists(model_save_name) & os.path.exists(prefix + "corpus.mallet")): - logging.warning("Pre-existing model files not found. Skipping test loading of them.") - return - model = ldamodel.LdaModel.load(model_save_name) - - # Test loaded model works on a new corpus, made of previously unseen documents. - other_texts = [['computer', 'time', 'graph'], - ['survey', 'response', 'eps'], - ['human', 'system', 'computer']] - other_corpus = [dictionary.doc2bow(text) for text in other_texts] - - unseen_doc = other_corpus[0] - vector = model[unseen_doc] # get topic probability distribution for a document - self.assertTrue(sum(n for _, n in vector) == 1) - def test_random_seed(self): if not self.mallet_path: return From f3bd92755a1cbab765c0873085dcc0d36858673d Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 14:45:04 +0500 Subject: [PATCH 14/16] remove not used data --- .../Mallet_TMP/MTMP/pre_rs_corpus.mallet | Bin 7496 -> 0 bytes .../Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer | Bin 6948 -> 0 bytes .../test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt | 1 - .../Mallet_TMP/MTMP/pre_rs_doctopics.txt | 9 --------- .../Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer | 2 -- .../Mallet_TMP/MTMP/pre_rs_inferencer.mallet | Bin 1397 -> 0 bytes .../Mallet_TMP/MTMP/pre_rs_topickeys.txt | 2 -- .../test/test_data/Mallet_TMP/Mallet_pre_rs.mdl | Bin 1361 -> 0 bytes 8 files changed, 14 deletions(-) delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_inferencer.mallet delete mode 100644 gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_topickeys.txt delete mode 100644 gensim/test/test_data/Mallet_TMP/Mallet_pre_rs.mdl diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet deleted file mode 100644 index 0c504c16fab90664bbcb3c5eddffd3743b51a2c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7496 zcmb7}&yOVMS;wpESO3`EWbN2s43@1#LPQ}mU_yxG10=C57LAFO-6WD14sTb#-CdsQ zs@|&V=^i5?bAT)204N6}gc1%YQVvKY@&PVLJ|dBSfGZ#*B&5i><@0^s?wQ_M6Rk(; znd++deSUnO@AKoW`QHCTS=UCtR1_yGS5>}0=`Yv5JGobPy{ij%Qg7#j;28r-I~7Usk8T>$>HeZvEQB=!La!&#TDEi}QfMwyXIjCFlu8lq=$4_M5 z$(P&KT?*3OpZxnv-~Pb;O%e_6MQK;Q>vwBV9&c0qCLk~Te*w7;M*en`S50y5XJ=6= zr2FXEBapu~eFspnG_Bpr%b3YoG+DO|!1SdDxR;JxGxXjH56wb0=3TRCi^JADxMqTO zYcn*$(3fu-dWH9+jfX(j_9pkR*Vl`Fi5;`j0`A&NL+)WT+H(QB;)nN+6g+{%0rG3f z+1t^bMcZuq<^8@{uLa1^*L`bVK7T}yEBx_pq7|3dk>ZYMCDPxUU9qZNm8Y{sar(_S z@4e2ZWaZWZZc%Se`?k68rw;fJj#}vm4oBsvgZ5T z70_$c`flCS9hZ5xX)pYxE-t&?ulgwM%M}mOO{eXNUjt6ow_d@^7Oh(^S-dLirtP+a z=%S788r76zC--r`Ui|Ib0Y+tY6V#BRN5wd#BS_wyg-qX++C;_-L%@((_I<@}eE z+o%e(m#VPwgpHqkc75kxzWB9g<0!cwWks{8``-&*!i`N`LYdwAJ#t{+|Lvc@@z-}= z{NP`A+vgCR@X?PzMOl>W(D3|C>RG;P89!7j2$z-h%ghrK{A~Iwa_a?19CZo*JK?Q6IapOY6gGEvFkjcRU(=-FHoO;b(70Klga2 zynb*uJfQ0ozU!z~H>L}*7b}j+vhFJAXe&S^q4x(ELa(3O;*7!B>~Az3)TQ zeDy(d?(4(1YmgO_%J0+-jt3=c-pBjTwC1&^whcdyY}-T9Mz363Xw%CR8mxQsZjU!%I9(uV& zW-s*p*Z=e{|LTW7_ySvS__oUQZ~u;5jc+k&bPdBq7$~Nm2FK{obbZCZkzO+B_0e!! z_RB^gOD;;L$*f;`4JH$}UbnvB7GJV$v#A&ihErcS1`~GI-myDD2wFCERi1lEowzyJ zZyms~h~Y+S`{l-lmxgp_5?XmzZC!?TJHng?Jt z(08WPgz$0oh!21b0BOMT0YMsE@WGP-YO{Gc@6lprpkaVkqsn$ENC(6u+b~2SBD1g{ zc&5oPt4R&irgd4PXMCWSf-%E{bk=OKCRsFyNw;MuB^96`3G#I1OqdTD4$DTI$Q$f! zxyWkoMHymVDt5zi!G!Bk8GEpSZn{oNo&6@-Gd&tDeW1nm~|rDPI9Z2Oc)p5Xr7` zGTbtD*sUpFb0o)losmtwJ&y?&{3i-RL6HjeAl=7~KSCVY2MvuUuJIK{E8`M+m_(G8!z4#x?S%`# zXITur3BkVgk~=8O9YNE>#OZfVdRok53StyDs-a@HM zT~gn0EZss7;RNDVsMrr@B?jW8-l^?`h88NxWM4o`*l<~{f}B|WKej3@z`cq??stWi zStbu-dbgqN-wr*8Qeu?ING;7ED&^CV8jK_R(g8235_*XXTO~Oes4^gQl#%+5wMn<3 z$*0)Qn#9QpUZDi}^j;N=OubDW1rE_rjbd>p3N0u?TH{m#_-h?NdvF}t#l){-|aHWsO-1fLb-Wa%vOdm5+j`WEMN z3an*FszDQ}BDOLb{SY-8?FUe}f&do8hV>lC3?Z9EVNO5faZp1&^Sq7|-G#2K z;N`c#U4>ZtHxqxM=DYQk84ryYqc6p>u zfqIPrR(awJv}BggJgu2+aD<~JH)mn;PMvWn2Xiwx*AOo?!vbFxVn$M{D#TlMDKqyi zg}b1{Z;3Run!%J1Mkl^l%q*cZ<(VwmmyLW&+Z4!|Ie}?_E%sgpCaO*HZ7amNhX$tX zSs3@s+)-J(E==@{+A99H=A2vz<;rF!@jJ8nj_UF%9)7XEwD$llM(HB#e}K z`yOZgi%no^R*gf+3<5Trf~;n?wnvFgNaLf^&^S~go1=Ith+NTfDBuG2nUImd2qR%` zjY4p0w2X3EHwP;&I7EeX?R0{*%uy1w0f02Xd1q5=I@;6_QULH)GhCz;GR#DJZA73X zsjgZ_HTPkIT#3_upiJ##&b2uG>Nsr@> z;g+dxAZ2V_B+w**jVx72cwqSw*f_o#;Bb!(rP6V`1og4zYC}G36k0@xWv&W#>o3)b zuCEhZQuWG(3se^zP%AbWW3ytJF65A|m)Y92hi9^-M-QhhQko*=FTx zTx?p+Tb4NL=D`B0h??ddW>C$-WMkmHUIYNih1;J7hxMUEi*(_*0v*sW+-JPJ_q7g+ zBTAC~1$k*NbbhtC`d8+YFLYkL@!zTMzyHRc|HbY{mp9(z=|$diKkIECFOZI2%ItQ# Fp8`h~>$v~` diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.mallet.infer deleted file mode 100644 index 9ed714292b9d24890eaa1597cfb74e4890f0d600..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6948 zcma)>&xu#KFa|FIvC+cW1hh0H|3J1$2m}H~+2!*+w`Zm`PceGX zJ=InBo*&_{*97oau(fP zxWRp~YS(V=har05tSC;y&QsfYdiCs;vuLWdU#?t_jHl1uau@Eja`o!eZt?u7rFX-& z^ACLGi=k}lA^P+a@?N{OhJANRtosXJuGRyRKOQ7+hq5~TGuN+QckL^WqUT%Rond2j zd;bu9=Fvw{lD1{*hv>PZwYqcq24C3lvB7mKKU{hHIQm93!m@DJ9Mr9N-$fsn<0mrj zbz^N0QI+0WFfAi7Le)yx!KgPfw zrEal6Q~c;lxVkNY;eeiCz=#Xqoqio4_U*RQs-OPP@4TN+9{!t&$KT0IKY#yI=f9WS zMO8?9sR|oU*!Ww|wDcCTPTby#GvV zUirwj;m46}2S~c;Q#Te`XL>@zjbX6s$2M*#)(`!Dcn!XSzo{!oZHS&fYB_nNa!Yo>zPXGz8uO6&+m=v&R=Cp(U8)gMU$Rw6@C1= zPgt1L0Ur3_qX$P>d{gW@u^aKi^1#y;nS;=GU;W#^|Mo9_{>N;=;k%@B^rv5ttMMJW zNZ-&XLbI5A$cwhn^g~56NiS*QLp0u%!@5z(l8cf_Asg0SO=srXw(|wI_>t|JZ3XF% z=e}?>9d_5=u{%KsTDEmno_k51xh2@|9Kf-N#-+8xdTWiTq1@?m8}I7fx&$=oe4Do7 z=xHp%B~hqBV@GX11`D7F=05u_EE{_Ek5T+ zZX1|#E@QGzvn`5?hG!KYG>^b)q~A;+3E|`FF&_XM0n&)&BZ4%#;DaY4)Mm?aIiSVN zK*I>FmMXioARQ5tY)gYgL}p<@@IqZbt0@iCrgd4PXL^-h3dRBx(nYhwnq<`=Cf$|2 zlvIF%B$B5u7s7l@<1ZWKMBZR;$3<3quaqI?rD8WOS4=M+m9YmK(oNS%skwfpJ^p}k z){t*vK0BuC`jmqPdd1B{>1QBlsQ?Y;cK(;i;yd`&SNwc zu5e6E9XZIN9KK%YW=<6>w;+*MD=TB~IjMVm`$skdLO#{6hH3(DdB5 ztj={PlQ9HEk)OCqWlu6ORUV2?rEm4R^A(RZlTat466iY zY+D&$sf&F0>o$ApXgCkjGAkqUf}9%9EI zq-Sd&0LfsKs(jLWG=|v2f}&L`A0=dFSq$!16d*+0%18=k(Il^9>>-`Z&n>e(8||pd zVl2i!S$0km1pJ5vvae-woeYg?Z0#;kK3#KQBL#)sOvYax5SR=NDP@^w(%5*%6$QNv z0JQpU3w4*pwsM_qCJ{QNf_`w0;hU836ayE^7QwBVTpb{3mPBOib_dO=GhRvn#DeZi zRWqka-*cv!OJwdU^&IwE=b-p8mh|KJ5NNb!Z zfWOuOv%$;!fm0eAFc~)|owy{W@ z5`0#SleM$RAJRB|+jlshL$FpMsRm7?DzR13=qFL5$zcE`R}jF0*sxv#nIUAeXsjZh z&Ar9}7zZ`*ndeoD(p~7v3SMjq+)bF($S^e^dL@%4q97pGH+9H>R70!T!jNRP%&c2* zG#9W;HHEh5(N&q)YdapPQ=ndBfH6;eftJkbnWr_g4UTZMAOOW`gk@hy?2Rx_9i!qSN!7Beg8Om!wp4z!W)Xqy5#Gbb<& zu$8@+ArsXm`Lz?`+(QFX_9BdXX6~e{T^}ZTMs1P*oh2t1Lbwio zAaX^=p@0jr&xA|_Mi>cmYZQW0qh*xSx;a>J!67P?Yp)ZmRgQ|F3jm}6&U@Qh)6upj zAq4<$tinYKkzpp%YfA(uNxW)=DiWNTsKZ!3KC}_sT52Q!cHrk2LO4`A7-w@%ifq6f z7Ziu3qTXqW7vNj=sygI|RSMqZkc!kDO2krAx#ppk);X=53F87HcqbxINW1HsdC({L4B&Z z+K>+$g%%NFnX7```U^YJ^>uz+}X85)6eo+l;QJ#kSMDWrd?|9xQ-G)HLrggK8G08bjXeMF5aoxWj31TwhVN zk}e!qNCz~G4;3%(eXWDy1WM9>FJ0OToj)k9|C{Q`7dkKB`k#lNz5Cj?{%-%Tm$%;J U=|$dee2K^lq@$NI`~Cj^0C;?#3;+NC diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt deleted file mode 100644 index c0e2f0b2dd..0000000000 --- a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_corpus.txt +++ /dev/null @@ -1 +0,0 @@ -0 0 computer time graph diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt deleted file mode 100644 index 788b36d215..0000000000 --- a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt +++ /dev/null @@ -1,9 +0,0 @@ -0 0 0.49056603773584906 0.5094339622641509 -1 1 0.48214285714285715 0.5178571428571429 -2 2 0.48148148148148145 0.5185185185185185 -3 3 0.5185185185185185 0.48148148148148145 -4 4 0.4716981132075472 0.5283018867924528 -5 5 0.5 0.5 -6 6 0.5 0.5 -7 7 0.49056603773584906 0.5094339622641509 -8 8 0.49056603773584906 0.5094339622641509 diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer deleted file mode 100644 index 979895ea3b..0000000000 --- a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_doctopics.txt.infer +++ /dev/null @@ -1,2 +0,0 @@ -#doc name topic proportion ... -0 0 0.4863731656184486 0.5136268343815513 diff --git a/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_inferencer.mallet b/gensim/test/test_data/Mallet_TMP/MTMP/pre_rs_inferencer.mallet deleted file mode 100644 index c1323ca474a619fd576ba2ea64c7e4baf40a5ba0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1397 zcma)5O^6gn6n-;3Gdr`{)!858M(_uO^$_R=ahHS%uI>())?QbgHHV#xwLP`dwL4wi zR#ngR7!eOf5CS;_lB)#uq9`b+cu)`_$Djux$wfg3L;^w}E+mJ%>e3R%(!J)$68Ejy4;4oI(H zi{rXhr!-4xT5G=sqOVkiVVAfbt<3~zI+i#j zMsHBT3gz+$6-n(nw_SyeI!j`h2_hs^U{tGi9JVrA0@P}(M=M>B*)8a;M%e`ztc#$f zyb87((wYN(Se#%K={7mbq)Du#nEgVKw!@?|{ja@GK5OivD!} z`1gZbetZMm^S=1%YX6hXMU|5kD8+^OdWu8)2>b^q?s zmtSqWi-CI9Hotx`_t7^OKI=vfB!u?ty%Xcj-)~;-nsv^sCXDpV@4}6?BvZQ1@-D{E z*!N)ch^%LRm#6Q4U~>P7d0+MqFlrbt>BF~gO}`g&?eV_;<@Cot|8`O0P%J>vBTagk z#_9lxq-2DbVSx7>vI%E5Z`;>?FEtmK|~%3CIOfKE z5V5W5hR%gQu{Oi{!0$VbQ|W`88G#M|I7z~5<4U-?=3VmE#}k>(1=#dEWFVupCz*<{ z^(JkYsXSB6i)3hImNGSU@BDwoHYM!LusiU(66qtGvG0n`Hf4*X<> z#|GetKGehl)Wrd4h=b4+i_j8F&=$+k5r?2F4nt2I0avU*UmPuL?r2xzr!zc0fEv{8 z?+|U$=|BTobQ*MT)9KRb5$DqBL%k{?Es`M0LXDqM#U(7IAkQ?OfRq36Y5ELL!U;J$fmRG@jYPv;K;eDIOL|%Y7bbw$I~?k^0;Y zepyT`K4oG;DAD*8EZ8tslI4=BJ5P`;QWR6Nfe{3q$l?eGnscQEq)EobNnU@LK^>2yF4u~F0T4DN-#msLKrh6 z)64kn3%oSK%P(F_ zjjQzxe&2dD-YN+h21*Awv|!T0J*JJu+ce?J~ie07=NPODIt^u1wED8 zg+IgJcE`(Y4J-f}?}&@S_UjixP55<$o7Ma+yuiC+y?n^)s`|I-@NLF%Ucb8au3+gs zgLk!#)1B)-aCF{x{J`)d25%yMEPyk_du8wMj}Qz#nAp~oeVDglOKb|-gKa^h|4`7Z OKN2+2zljzrCf>ia6 Date: Thu, 10 Jan 2019 14:51:22 +0500 Subject: [PATCH 15/16] fix typo --- gensim/models/wrappers/ldamallet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 133b7d2f88..5eb66eba22 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -265,7 +265,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): def train(self, corpus): """Train Mallet LDA. - Parameterstrain-topics + Parameters ---------- corpus : iterable of iterable of (int, int) Corpus in BoW format From 86ede215e7bd100fd6d46be0299303c85d096645 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 10 Jan 2019 15:22:01 +0500 Subject: [PATCH 16/16] fill random_seed=0 in load --- gensim/models/wrappers/ldamallet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 5eb66eba22..eee1a542a6 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -576,7 +576,7 @@ def load(cls, *args, **kwargs): """ model = super(LdaMallet, cls).load(*args, **kwargs) if not hasattr(model, 'random_seed'): - model.random_seed = None + model.random_seed = 0 return model