# news_trainer.py -- trains and pickles news-category text classifiers
# (NaiveBayes / LinearSVC / BernoulliNB / MultinomialNB) on a JSON corpus.
import os
import random
import re
import json
import gc
import pickle
from collections import Counter
from itertools import izip_longest

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.regexp import wordpunct_tokenize
from nltk.classify import SklearnClassifier
from nltk.classify.naivebayes import NaiveBayesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

current_path = os.path.dirname(os.path.realpath(__file__))
# For NLTK to find its datasets; this must run before the stopwords
# corpus is loaded below, or the local 'datasets' dir is never searched.
nltk.data.path.append(os.path.join(current_path, 'datasets'))

porter_stemmer = PorterStemmer()
sw = stopwords.words('english')

# Building the large training-data lists creates many short-lived objects;
# cyclic GC is disabled here and triggered manually at the end of train().
gc.disable()
RECORDS_TO_TRAIN = 10000  # batch size for grouper(); only used if batching is enabled

# CUR_CL holds the classifier currently being trained; __main__ resets it
# for each entry in CLASSIFIERS.
CLASSIFIERS = ('sklearnLinSVC', 'BernoulliNB', 'MultinomialNB')  # 'NaiveBayesClassifier' is also supported
CLASSIFIER = 'BernoulliNB'
CUR_CL = None

classification_name = 'news_based_' + CLASSIFIER
# Location of the saved news corpus
news_corpus_dir = os.path.join(current_path, 'news_data')
# Location of the pickled classifier data
pickles_dir = os.path.join(current_path, "classifier_pickles")
def grouper(n, iterable, padvalue=None):
    "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
    # izip_longest pads the final group with padvalue, as the docstring
    # promises; plain izip would silently drop a trailing partial group.
    return izip_longest(*[iter(iterable)] * n, fillvalue=padvalue)
def preprocess_text(text, stem=False):
    # Strip @mentions, plain http:// URLs, and runs of quote/dot/hash characters.
    text = re.sub(r'(@[a-zA-Z0-9]+)|(http://[a-zA-Z0-9]*[.][a-zA-Z]+[/a-zA-Z0-9]*)|([".#]+)', '', text)
    if stem:
        # Stem each word separately; stemming the whole string as a single
        # token would mangle it.
        return ' '.join(porter_stemmer.stem(w) for w in text.split())
    return text
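# Illustrative behaviour (assumed example input, not from the corpus):
#   preprocess_text('@user reads http://example.com/story "Big #News"')
#   returns ' reads  Big News' -- mention, URL and quote/hash chars removed.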
def get_text_words(text, stopwords=sw):
    # Tokenize the cleaned, lowercased text and drop stopwords, leftover URL
    # fragments, and very short tokens.
    text = preprocess_text(text)
    url_fragments = set(["http", "://"])
    text_words = set(wordpunct_tokenize(text.lower()))
    text_words = text_words.difference(stopwords)
    text_words = text_words.difference(url_fragments)
    return [w for w in text_words if len(w) > 2]
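# Illustrative behaviour: get_text_words('The cats and the dogs') returns
# something like ['cats', 'dogs'] -- stopwords ('the', 'and') and tokens of
# length <= 2 are dropped; ordering is arbitrary because a set is used.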
def word_indicator(text, **kwargs):
    # MultinomialNB works on token counts; the other classifiers get
    # boolean word-presence features.
    if CLASSIFIER == 'MultinomialNB':
        drop = set(sw) | set(["http", "://"])
        counts = Counter(wordpunct_tokenize(text.lower()))
        features = dict((w, c) for w, c in counts.iteritems() if w not in drop)
    else:
        features = {}
        for w in get_text_words(text, **kwargs):
            features[w] = True
    return features
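# Illustrative behaviour: for 'good good news', MultinomialNB mode yields
# counts such as {'good': 2, 'news': 1}, while the other classifiers get
# boolean indicators such as {'good': True, 'news': True}.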
def features_from_text(text, label, **kwargs):
features = word_indicator(text, **kwargs)
return (features, label)
def train(records):
    """Train the current classifier on (label, text) records and pickle it."""
    global CUR_CL
    train_data = []
    for class_label, text in records:
        train_data.append(features_from_text(text, class_label, stopwords=sw))
    if CUR_CL is None:
        if CLASSIFIER == 'NaiveBayesClassifier':
            classifier = NaiveBayesClassifier.train(train_data)
        else:
            # All sklearn-backed classifiers share the same tf-idf + chi2
            # feature-selection pipeline; only the final estimator differs.
            estimators = {'sklearnLinSVC': LinearSVC(multi_class='ovr'),
                          'BernoulliNB': BernoulliNB(),
                          'MultinomialNB': MultinomialNB()}
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', estimators[CLASSIFIER])])
            classifier = SklearnClassifier(pipeline).train(train_data)
        print CLASSIFIER
        CUR_CL = classifier
    else:
        # NLTK's SklearnClassifier.train() re-fits from scratch on the data
        # it is given; this is not an incremental partial_fit().
        print 'Re-training on the next batch..\n'
        CUR_CL.train(train_data)
    pickle_path = "%s/%s.pickle" % (pickles_dir, 'news_based_' + CLASSIFIER)
    with open(pickle_path, 'wb') as f:
        pickle.dump(CUR_CL, f)
    print '%s saved' % pickle_path
    gc.collect()
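# A minimal usage sketch (not part of the training flow): how one of the
# saved pickles could be loaded and applied to new text. The function name
# 'classify_saved' and the default pickle name are illustrative assumptions.
def classify_saved(text, name='news_based_BernoulliNB'):
    # Load a previously trained classifier and classify one document with
    # the same feature extraction used in training (boolean indicators here,
    # since the global CLASSIFIER defaults to 'BernoulliNB').
    with open("%s/%s.pickle" % (pickles_dir, name), 'rb') as f:
        clf = pickle.load(f)
    return clf.classify(word_indicator(text))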
def get_filenames():
    """Return shuffled (category, filepath) pairs; the category is the name
    of the subdirectory a file lives in."""
    files = []
    # All files in all categories
    for dirpath, dirnames, filenames in os.walk(news_corpus_dir):
        for filename in filenames:
            files.append((os.path.split(dirpath)[1],
                          os.path.join(dirpath, filename)))
    random.shuffle(files)
    return files
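# Assumed corpus layout (inferred from the os.walk logic above):
#   news_data/
#       sports/0001.json
#       politics/0002.json
# yielding pairs such as ('sports', '.../news_data/sports/0001.json').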
if __name__ == '__main__':
    files = get_filenames()
    # Train each classifier in turn
    for cl in CLASSIFIERS:
        CLASSIFIER = cl
        CUR_CL = None
        # Batching via grouper(RECORDS_TO_TRAIN, files) would suit a real
        # partial_fit(), which NLTK's classifiers do not expose, so all
        # records are trained in a single pass instead.
        preprocessed_records = []
        print '%d files in the corpus' % len(files)
        for category, filepath in files:
            with open(filepath) as data_file:
                data = json.load(data_file)
            data['text'] = data['body'] + ' ' + data['title']
            # The label comes from the JSON record itself, not from the
            # directory name returned by get_filenames().
            preprocessed_records.append((data['category'], data['text']))
        train(preprocessed_records)