"""
This file is used to create a dictionary from the Medium corpus stored in ES, and then use that dictionary to train a LDA model over this corpus
(1) server_addr : ES server address
(2) dict_name : name of created dictionary
(3) model_name : name of created LDA model
(4) min_docs : min number of documents the word should appear in the corpus
(5) max_docs : max number of docs the word should appear in the corpus (in fraction)
(6) n_topics : number of topics the model is to be trained on
(7) n_passes : number of passes the model is to be trained for
(8) n_iters : number of iterations per pass
"""
from elasticsearch import Elasticsearch, helpers
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import logging
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
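
# Placeholder configuration: the values below are assumptions for illustration,
# not part of the original script; adjust them to your own ES server, output
# paths, and training budget.
server_addr = 'http://localhost:9200'  # assumed ES address
dict_name = 'medium_corpus.dict'       # assumed output path for the dictionary
model_name = 'medium_lda.model'        # assumed output path for the LDA model
min_docs = 5       # keep words appearing in at least 5 documents
max_docs = 0.5     # keep words appearing in at most 50% of documents
n_topics = 100     # assumed number of topics
n_passes = 10      # assumed passes over the corpus
n_iters = 50       # assumed iterations per pass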
es = Elasticsearch(server_addr, timeout=300)
p_stem = PorterStemmer()
query = {
    "query": {
        "match_all": {}
    }
}
docs = []
texts = []
stop_list = stopwords.words('english')
# Medium-specific boilerplate tokens ("Continue reading ... on Medium") are dropped as well.
stop_list.extend(['continue', 'reading', 'medium'])
print "data extraction from es..."
'''
# Optional: also pull documents from the 'new_data_dump' index.
result1 = helpers.scan(es, query, scroll='5m', index='new_data_dump', doc_type='medium')
for res in result1:
    docs.append((res["_source"]["medium_title"] + " " + res["_source"]["medium_summary"]).lower().split())
'''
result2 = helpers.scan(es, query, scroll='5m', index='data_dump', doc_type='medium')
for res in result2:
    # Tokenize title + summary into lowercase words; the steps below iterate
    # over each document's tokens, so a raw concatenated string would be
    # processed character by character.
    docs.append((res["_source"]["medium_title"] + " " + res["_source"]["medium_summary"]).lower().split())
"""
doc_len = []
for d in docs:
doc_len.append(len(d))
print "average document length is: %s" % (sum(doc_len) / len(doc_len))
print "combining documents in sets of 7..."
for i in range(0, len(docs), 7):
temp_list = []
for j in range(i, i+7):
temp_list.extend(docs[j])
texts.append(temp_list)
"""
print "removing stopwords and stemming..."
new_texts = [[p_stem.stem(tok) for tok in doc if tok not in stop_list] for doc in docs]
# Enable INFO logging so gensim reports dictionary construction and training progress.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
dictionary = Dictionary(new_texts)
# Drop words appearing in fewer than min_docs documents (absolute count) or in
# more than max_docs of them (fraction); keep_n=None keeps every surviving word.
dictionary.filter_extremes(no_below=min_docs, no_above=max_docs, keep_n=None)
dictionary.compactify()  # reassign word ids to remove gaps left by filtering
dictionary.save(dict_name)
corpus = [dictionary.doc2bow(text) for text in new_texts]  # (token_id, count) pairs per document
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=n_passes, iterations=n_iters)
lda.save(model_name)
print(lda.print_topics(10))
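
# A minimal sketch of reusing the saved artifacts on an unseen document; the
# sample text is illustrative only, and the preprocessing mirrors the steps above.
loaded_dict = Dictionary.load(dict_name)
loaded_lda = LdaModel.load(model_name)
sample = "how to train a topic model on medium articles"
sample_tokens = [p_stem.stem(tok) for tok in sample.lower().split() if tok not in stop_list]
sample_bow = loaded_dict.doc2bow(sample_tokens)
print(loaded_lda.get_document_topics(sample_bow))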