-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw2_main.py
70 lines (52 loc) · 2.31 KB
/
hw2_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from globals import *
import wrapper_method_tests as wrapper_tests
import filtrer_method_tests as filter_tests
import utils
def clean_func():
    """Run ``wrapper_tests.clean_data`` with the current ``clean_args``.

    The values of the module-level ``clean_args`` dict are passed
    positionally, in the dict's insertion order, so the order of its keys
    has to line up with the parameter order of ``clean_data``. The dict is
    read at call time, which lets callers swap ``clean_args['df']`` between
    calls.
    """
    positional_args = tuple(clean_args.values())
    return wrapper_tests.clean_data(*positional_args)
# DataFrame that the steps below operate on. Could be the train, validation
# or test split.
df = df_train
features_info_dict = utils.get_features_info_dict(df)

# --------------------- Configs -------------------------
# Cleaning. clean_func() unpacks these values positionally, so the key order
# here must not be changed without checking the order of the parameters of
# wrapper_tests.clean_data.
to_clean = True
clean_args = {
    # Follows `df` (instead of naming df_train directly) so that choosing a
    # different split above also changes what gets cleaned.
    'df': df,
    'features_info_dict': features_info_dict,
    'drop_features': False,
    'negative_to_mean': False,
    'labels_to_unique_ints': False,
    'nominal_to_bool_split': True,
    'missing_values_fill': True,
    'binary_to_numeric': True,
    'normalization': True,
}

# Correlation matrix.
# RUN WITHOUT CLEANING (except for making everything numeric)
to_print_corr_matrix = False
correlation_thresholds = [0.8]
# NOTE(review): print_info is not read anywhere in the visible script.
print_info = False

# Accuracy test on a cleaned train / test pair.
to_test_accuracy = False
df_to_train = df_train
df_to_test = df_validation
# NOTE(review): clean_accuracy_dfs is not read anywhere in the visible script.
clean_accuracy_dfs = True

# Feature selection.
to_test_feature_selection = True
feature_selection_method = filter_tests.relief

# Scatter plot of one feature against another.
to_plot_two_features_together = False
x_feature = 'Yearly_ExpensesK'
y_feature = 'Avg_Residancy_Altitude'

# Plot of the examples belonging to a single label.
to_plot_feature_and_label = False
label_to_plot = 'Purples'
# --------------------- RUN -----------------------------
# Each step below is switched on or off by its flag in the config section.

if to_clean:
    # Debug aid: uncomment to list the columns removed by cleaning.
    # x = list(df.columns)
    df = clean_func()
    # y = list(df.columns)
    # print([elem for elem in x if elem not in y])
    # print(len(x)-len(y))

if to_print_corr_matrix:
    for threshold in correlation_thresholds:
        filter_tests.find_correlated_features(df=df, correlation_threshold=threshold)

if to_test_accuracy:
    # clean_func() reads clean_args['df'] at call time, so the entry is
    # pointed at each frame in turn and then set back to `df`.
    clean_args['df'] = df_to_train
    cleaned_train = clean_func()
    clean_args['df'] = df_to_test
    cleaned_val = clean_func()
    clean_args['df'] = df

    # Remove the columns that exist only in the cleaned train frame.
    # DataFrame.drop returns a new frame rather than modifying in place, so
    # the result has to be assigned back for the removal to take effect.
    missing_cols = [col for col in cleaned_train.columns
                    if col not in cleaned_val.columns]
    cleaned_train = cleaned_train.drop(columns=missing_cols)

    wrapper_tests.test_with_random_forest(cleaned_train, cleaned_val)

if to_test_feature_selection:
    res = feature_selection_method(df=df)
    print('{} features sorted by importance:'.format(len(res)))
    for elem in res:
        print(elem)

if to_plot_two_features_together:
    # Imported only when this plot is actually requested.
    import matplotlib.pyplot
    matplotlib.pyplot.plot(df[x_feature], df[y_feature], 'ro')
    # Show through the same module that drew the plot instead of depending
    # on `utils` exposing a `plt` attribute.
    matplotlib.pyplot.show()

if to_plot_feature_and_label:
    utils.plot_label_and_examples(label_to_plot, df)