# train_script.py

# Keep a log while training: large models take a while, and we may need the
# logging again later when we move on to LSI or LDA.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# gensim does most of the heavy lifting here (everything except BM25).
import gensim
from gensim import corpora
# Document filtering before training: pattern's plaintext() converts the raw
# HTML into plain text.
from pattern.web import plaintext
import os
import re

def html_to_text(html):
    # Convert HTML to lower-cased plain text: keep a few structural tags
    # through plaintext(), then strip the leftover markup with a regex
    # (chained replace() calls on bare strings like 'p' would also delete
    # every letter p from the text).
    text = plaintext(html, keep={'h1': [], 'h2': [], 'strong': [], 'div': [], 'p': []})
    return re.sub(r'</?(?:h1|h2|strong|div|p)\b[^>]*>', ' ', text).lower()

# Read every HTML file in the current directory into a list of plain-text
# documents (only .html files, matching the second pass below).
documents = []
for fn in os.listdir('.'):
    if os.path.isfile(fn) and fn.endswith('.html'):
        with open(fn, 'r') as fptr:
            documents.append(html_to_text(fptr.read()))
# Tokenise on whitespace and drop a small hand-picked stopword list
# (the documents are already lower-cased above).
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.split() if word not in stoplist]
         for document in documents]
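# Note: gensim also ships a much larger built-in stopword list; assuming its
# parsing utilities are available, a drop-in alternative would be:
#   from gensim.parsing.preprocessing import STOPWORDS
#   texts = [[w for w in doc.split() if w not in STOPWORDS] for doc in documents]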
# Map each token to an integer id, then prune the vocabulary: no_below=1
# keeps even tokens that occur in a single document, no_above=0.5 drops
# tokens present in more than half of the documents, and keep_n caps the
# vocabulary at 50000 entries.
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=50000)
dictionary.save('dictionary.dict')
# Convert each document to a sparse bag-of-words vector and persist the
# corpus in Matrix Market format.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('Corpus.mm', corpus)
# Fit TF-IDF weights on the bag-of-words corpus and save the model.
tfidf = gensim.models.TfidfModel(corpus)
tfidf.save('tf-idfmodel.tfidf_model')
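# The three saved artifacts can be reloaded in a later session, e.g.:
#   dictionary = corpora.Dictionary.load('dictionary.dict')
#   corpus = corpora.MmCorpus('Corpus.mm')
#   tfidf = gensim.models.TfidfModel.load('tf-idfmodel.tfidf_model')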
# Second pass: transform each HTML file into its TF-IDF vector, a sparse list
# of (token_id, weight) pairs. Apply the same HTML-to-text cleanup used for
# training so both passes share a vocabulary.
dict_tfidf = {}
for fn in os.listdir('.'):
    if fn.endswith('.html'):
        with open(fn, 'r') as fptr:
            new_vec = dictionary.doc2bow(html_to_text(fptr.read()).split())
        dict_tfidf[fn] = tfidf[new_vec]
# Write one "<filename>\t<tfidf vector>" line per document; open in 'w' mode
# so re-runs overwrite rather than append duplicate rows.
with open('out.tsv', 'w') as target:
    for key, value in dict_tfidf.items():
        target.write(key + '\t' + str(value) + '\n')
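
# The header comment mentions moving on to LSI or LDA. A minimal, commented-out
# sketch of that next step (num_topics=200 is an assumed placeholder and
# 'some_doc.html' a hypothetical key, neither taken from this script):
#   lsi = gensim.models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=200)
#   index = gensim.similarities.MatrixSimilarity(lsi[tfidf[corpus]], num_features=200)
#   sims = index[lsi[dict_tfidf['some_doc.html']]]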