-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_model.py
110 lines (89 loc) · 3.44 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
This is a speech recognition project for the Universidade Federal do ABC.
"""
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
def main() -> None:
    """Train, evaluate, and persist the random-forest classifier.

    Steps, in order:

    1. Read ``./train.csv`` and ``./valid.csv``. In both files the last
       column is used as the class label and every other column as a
       feature.
    2. Fit a ``SelectFromModel`` wrapper (``threshold="1.25*median"``)
       around a random forest on all features to pick the relevant ones.
    3. Refit the forest on the selected features only, predict the
       validation set, and print the confusion matrix and the accuracy.
    4. Refit on training + validation data combined, then dump the model
       to ``./final_model.pkl`` and the boolean feature mask to
       ``./features.pkl``.
    5. Save a confusion-matrix heatmap to
       ``./graphs/confusion_matrix.png``.

    Returns
    -------
    None
    """
    # Load training data
    with open("./train.csv") as data_file:
        train = pd.read_csv(data_file)

    # Load validation data
    with open("./valid.csv") as data_file:
        valid = pd.read_csv(data_file)

    # Train a random forest
    # NOTE: the number of estimators is not optimized.
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=1000)

    # Selector that keeps only the most valuable attributes
    selected = SelectFromModel(clf, threshold="1.25*median")

    # Factorize the classes: ``fact`` holds integer codes, ``char`` the
    # sorted unique labels used to map predictions back to class names.
    fact, char = pd.factorize(train.iloc[:, -1], sort=True)

    # Train with every attribute
    print("Fitting...")
    selected.fit(train.iloc[:, :-1], fact)

    # The mask does not change after the selector is fitted, so compute it
    # once and reuse it for training, prediction, and persistence.
    support = selected.get_support()

    # Train with the most relevant attributes
    print("Refitting...")
    clf.fit(train.iloc[:, :-1].loc[:, support], fact)

    # Disabled hyper-parameter search, kept for reference. Enabling it
    # requires importing GridSearchCV from sklearn.model_selection, which
    # this file does not currently do.
    #
    # # Cross validation
    # param_grid = [{"n_estimators": [100, 300, 500, 800, 1200],
    #                "max_depth": [5, 8, 15, 25, 30],
    #                "min_samples_split": [2, 5, 10, 15, 100],
    #                "min_samples_leaf": [1, 2, 5, 10]}
    #               ]
    # grid_search = GridSearchCV(clf, param_grid, cv=3, verbose=1, n_jobs=-1)
    # grid_search.fit(train.iloc[:, :-1], fact)
    # print("Best params:")
    # print()
    # print(grid_search.best_params_)
    # print()
    # print("accuracy:")
    # print()
    # means = grid_search.cv_results_["mean_test_score"]
    # stds = grid_search.cv_results_["std_test_score"]
    # for mean, std, params in zip(means, stds,
    #                              grid_search.cv_results_["params"]):
    #     print("\t%0.3f (+/-%0.03f) :: %r" % (mean, std * 2, params))
    # print()
    # # Apply the learned model
    # pred = grid_search.best_estimator_.predict(valid.iloc[:, :-1])

    # Evaluate on the validation set using the selected attributes
    pred = clf.predict(valid.iloc[:, :-1].loc[:, support])
    matrix = pd.crosstab(valid.iloc[:, -1], char[pred],
                         rownames=["Actual Char"], colnames=["Predicted Char"])
    print(matrix)
    print(accuracy_score(valid.iloc[:, -1], char[pred]))

    # Train a model with all the data.
    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True builds the same combined frame.
    bigdata = pd.concat([train, valid], ignore_index=True)
    fact, char = pd.factorize(bigdata.iloc[:, -1], sort=True)
    print("Fitting the final model...")
    clf.fit(bigdata.iloc[:, :-1].loc[:, support], fact)

    # Save the final model and the feature mask needed to use it
    print("Saving the model...")
    with open("./final_model.pkl", "wb") as model_file:
        joblib.dump(clf, model_file)
    with open("./features.pkl", "wb") as features_file:
        joblib.dump(support, features_file)

    # Plot the validation confusion matrix
    ax = sns.heatmap(matrix, annot=True, cbar=False, square=True, fmt="d")
    # NOTE(review): widening the y-limits by half a cell looks like the
    # workaround for heatmap rows being clipped by some matplotlib
    # releases -- confirm it is still needed with the pinned version.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.title("Confusion Matrix")

    # Create the output directory if needed so the run does not fail on the
    # very last step after all the training has been done.
    os.makedirs("./graphs", exist_ok=True)
    with open("./graphs/confusion_matrix.png", "wb") as graph_file:
        plt.savefig(graph_file)
# TODO REMOVE BACKGROUND NOISE
# TODO FILTER FOR VOCAL WAVE LENGTH (MADE IT WORSE)
# TODO TRIM SILENCE
# Run the training pipeline only when this file is executed as a script,
# so importing the module does not trigger training.
if __name__ == "__main__":
    main()