-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain.py
145 lines (124 loc) · 4.84 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from DataSetReader import DataSetReader
from PreProcess import PreProcess
from Vectorizer import Vectorizer
from Classify import Classify
from NewClassifier import Classifier
import itertools
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
def clean_string(str):
    """Turn the printed form of a container into newline-separated text.

    Each space is replaced by a newline; square brackets, braces,
    parentheses, single quotes and commas are removed; every other
    character is kept as is.

    The parameter name shadows the builtin ``str``; it is kept so that
    existing callers are unaffected.
    """
    dropped = ('[', ']', '{', '}', "'", '(', ')', ',')
    return ''.join(
        '\n' if ch == ' ' else ch
        for ch in str
        if ch == ' ' or ch not in dropped
    )
def get_plot_arrs(params, acc_num, items):
    """Build the bar heights and tick labels for one parameter setting.

    params: the parameter setting the results belong to; its string form
        is appended to every tick label.
    acc_num: maximum number of results to return.
    items: iterable of (accuracy, vectorizer, preprocessing_ops) tuples;
        the first ``acc_num`` entries are used, in the order given.

    Returns ``(plt_arr, ticks_arr)``: the selected accuracies and one label
    per accuracy. If fewer than ``acc_num`` results are available, only
    those are returned instead of raising ``IndexError``.
    """
    # Labels must be built from the same entries as the accuracies, i.e.
    # from ``items`` -- not from whatever a module-level loop variable
    # happens to hold at call time.
    selected = list(items)[:max(acc_num, 0)]
    params_label = '\n Parameters: ' + clean_string(str(params))
    plt_arr = [entry[0] for entry in selected]
    ticks_arr = [
        clean_string(str(entry[1])) + clean_string(str(entry[2])) + params_label
        for entry in selected
    ]
    return plt_arr, ticks_arr
def plot_clfs(width=0.15, max_num=3, acc_sep=1.2, param_sep=2.8):
    """Show one bar chart per classifier stored in the module-level ``clf_dict``.

    For every classifier, the first ``max_num`` results of each parameter
    setting (as returned by ``get_plot_arrs``) are drawn as a group of bars.

    width: width of a bar.
    max_num: number of accuracies to plot per parameter setting.
    acc_sep: base separation between accuracies of one parameter setting,
        in units of ``width``.
    param_sep: base separation between parameter settings, in units of
        ``width``.

    Both separations are widened per figure by an amount proportional to
    the number of parameter settings of that classifier.
    """
    palette = ['#009688', '#35a79c', '#54b2a9', '#65c3ba', '#83d0c9', '#8fd4ce', '#9bd9d3']
    # One colour per bar inside a parameter group; wrap around the palette
    # so max_num may exceed the number of predefined colours.
    colors = [palette[i % len(palette)] for i in range(max_num)]
    my_dpi = 192

    for key, value in clf_dict.items():
        print(key)
        plt_acc = []
        plt_tks = []
        for k, v in value.items():
            plt_arr, plt_arr_ticks = get_plot_arrs(k, max_num, v)
            print(plt_arr, plt_arr_ticks)
            plt_acc += plt_arr
            plt_tks += plt_arr_ticks

        if not plt_acc:
            # Nothing was recorded for this classifier: there is no bar to draw.
            continue

        # Derive the separations from the base arguments for every figure,
        # so the widening of one classifier does not leak into the next.
        fig_acc_sep = acc_sep + len(value) / 20
        fig_param_sep = param_sep + len(value) / 10

        # x positions: a small step inside a parameter group, a larger step
        # whenever a new group of max_num bars starts.
        positions = [1]
        for i in range(1, len(plt_acc)):
            gap = fig_param_sep if i % max_num == 0 else fig_acc_sep
            positions.append(positions[-1] + gap * width)
        ind = np.array(positions)

        plt.figure(figsize=(4096 / my_dpi, 2160 / my_dpi), dpi=my_dpi)
        plt.title(str(key))
        plt.bar(ind, plt_acc, width=width, color=colors)
        plt.xticks(ind, plt_tks)
        plt.show()
# Load the labelled reviews from the aclImdb directory.
dsr = DataSetReader(directory="../aclImdb/")

# Training data split by label: the first half is taken as negative and the
# second half as positive (assumes the reader returns them in that order --
# TODO confirm against DataSetReader).
tr_data = dsr.labelled_string_data('train')
tr_half = len(tr_data) // 2
tr_negative, tr_positive = tr_data[:tr_half], tr_data[tr_half:]
# Small training set: up to 500 samples from each half.
tr_small = tr_negative[:500] + tr_positive[:500]

# Test data, split and reduced the same way.
tst_data = dsr.labelled_string_data('test')
tst_half = len(tst_data) // 2
tst_negative, tst_positive = tst_data[:tst_half], tst_data[tst_half:]
tst_small = tst_negative[:500] + tst_positive[:500]

# Preprocessing combinations: every non-empty subset of the cleaning
# operations, ordered by subset size.
cleaning_operations = [
    'remove_stopwords',
    'lemmatize',
    'stemmingLS',
    'stemmingPS',
    'stemmingSB',
]
combinations = list(itertools.chain.from_iterable(
    itertools.combinations(cleaning_operations, size)
    for size in range(1, len(cleaning_operations) + 1)
))

# Vectorizers: [name, constructor parameters]
vec_list = [
    ['tfidf', {}],
    ['count', {}],
    ['wordembedd', {'min_count': 1}],
    ['fasttext', {'min_count': 1}],
]

# Classifiers: [name, parameter grid to tune over]
clf_list = [
    ['RandomForestClassifier', {'n_estimators': list(range(10, 200, 10))}],
    ['KNN', {'n_neighbors': list(range(1, 8, 2))}],
    ['SVC', {'C': [0.1, 10, 100], 'kernel': ['rbf', 'poly']}],
    ['LogisticRegression', {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag']}],
    ['DecisionTree', {'criterion': ['gini', 'entropy']}],
]

# Results, filled in by the experiment loop:
# ex: {'KNN': {params: [(acc, vectorization, preprocessing_ops), ...]}, 'LR': ... }
clf_dict = {name: {} for name, _ in clf_list}
# Try every vectorizer x preprocessing combination x classifier and record
# the accuracies reported by the classifier's tuning step.
for vec in vec_list:
    vec_name, vec_params = vec
    print()
    print('Vectorization: ' + str(vec_name))
    for combination in combinations:
        print('Preprocessing: ' + str(combination))

        # Clean the small train/test sets with the current operations.
        tr_prp = PreProcess(tr_small)
        tst_prp = PreProcess(tst_small)
        tr_clean_data = tr_prp.clean(combination)
        tst_clean_data = tst_prp.clean(combination)

        vectorizer = Vectorizer(
            type=vec_name,
            fit_data=tr_clean_data,
            tst_data=tst_clean_data,
            params=vec_params,
        )
        tr_small_vecs, tst_small_vecs = vectorizer.vectorize()

        for cl in clf_list:
            clf_name, clf_grid = cl
            print('Classifier: ' + str(clf_name))

            # Positions 1 and 2 of each vectorized sample are handed to the
            # classifier (presumably features and label -- confirm against
            # Vectorizer.vectorize).
            tr_x = [d[1] for d in tr_small_vecs]
            tr_y = [d[2] for d in tr_small_vecs]
            clf = Classifier(clf_name, tr_x, tr_y)

            tst_x = [d[1] for d in tst_small_vecs]
            tst_y = [d[2] for d in tst_small_vecs]
            params_accs = clf.tune(clf_grid, tst_x, tst_y, max_only=False)
            print('Scores:' + str(params_accs))

            # Per classifier, keep for every parameter setting the list of
            # all accuracies together with the methods that produced them.
            recorded = clf_dict[clf_name]
            for key, value in params_accs.items():
                recorded.setdefault(key, []).append((value, vec, combination))
# Order the results of every parameter setting from best to worst accuracy.
for key, value in clf_dict.items():
    for k, v in value.items():
        # Each entry is (accuracy, vectorizer, preprocessing_ops); the list
        # is sorted in place, so clf_dict keeps referring to it.
        v.sort(key=lambda entry: entry[0], reverse=True)

print(clf_dict)

# Plots
plot_clfs(width=0.15, max_num=3, acc_sep=1.2, param_sep=2.8)