app.py
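"""End-to-end experiment pipeline: load the cross-validation folds from
datasetsCBR, preprocess and scale them, cross-validate kNN and SVM over a
hyperparameter grid, apply the Wilson-threshold, GCNN, and DROP3 instance
reduction techniques, and write CSV results plus Friedman/Nemenyi PDF reports.
Run with `python app.py`; the script prompts for the dataset name.
"""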
import pandas as pd
import time
import os
from utils import load_datasets, get_process_time, create_directory
from preprocessing.preprocessing import (
process_folds,
changecoding,
rename_class_column,
identify_binary_variables,
identify_categorical_variables,
scale_folds,
)
from reporting.stats_friedman import (
run_friedman_test,
get_top_configurations,
analyze_voting_schemes_knn,
analyze_weighting_methods_knn,
analyze_k_values_knn,
analyze_svm_kernels,
analyze_svm_c_values,
analyze_svm_gamma_values,
analyze_reduction_techniques,
)
from classifiers.knn import cross_validate_kNN_weighted_parallel
from classifiers.svm import cross_validate_svm
from reduction_techniques.wilson_th import evaluate_reduced_wilsonth_knn
from reduction_techniques.gcnn import evaluate_reduced_gcnn_knn_parallel
from reduction_techniques.drop_3 import evaluate_reduced_drop3_knn_parallel
from reporting.report import (
aggregate_results_by_x,
create_performance_report,
create_nemenyi_report,
knn_create_reduction_comparison_report,
svm_create_reduction_comparison_report,
create_class_distribution_report,
create_hyperparameter_cd_diagram,
)
if __name__ == "__main__":
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, "csv_results")
report_dir = os.path.join(script_dir, "reports")
dataset_name = (
input("Enter the name of the dataset ['hepatitis', 'pen-based']: ")
or "hepatitis"
)
directory_path = os.path.join(script_dir, "datasetsCBR", dataset_name)
# track time
start_time = time.time()
# Load datasets
train_dfs, test_dfs, train_and_test_dfs = load_datasets(
directory_path, dataset_name
)
print(
f"Loaded {len(train_dfs)} training datasets and {len(test_dfs)} test datasets"
)
# Change coding of datasets
train_dfs = [changecoding(df) for df in train_dfs]
test_dfs = [changecoding(df) for df in test_dfs]
# Rename the last column to "class" if it is not already called class/Class
train_dfs = [rename_class_column(df) for df in train_dfs]
test_dfs = [rename_class_column(df) for df in test_dfs]
train_and_test_dfs = [changecoding(df) for df in train_and_test_dfs]
train_and_test_dfs = [rename_class_column(df) for df in train_and_test_dfs]
print("Changed coding of datasets")
# Combine all training data to identify variables
all_train = pd.concat(train_dfs, ignore_index=True)
binary_vars = identify_binary_variables(all_train)
categorical_variables = identify_categorical_variables(all_train, binary_vars)
print("These are the categorical variables:", categorical_variables)
continuous_vars = list(
set(all_train.columns) - set(binary_vars) - set(categorical_variables)
)
print(
f"Identified {len(binary_vars)} binary, {len(continuous_vars)} continuous, "
f"and {len(categorical_variables)} categorical variables"
)
# Process each fold
train_folds, test_folds = process_folds(
train_dfs, test_dfs, binary_vars, continuous_vars, categorical_variables
)
# After imputation, verify that no missing values remain in any fold
for i in range(len(train_folds)):
missing_after = train_folds[i].isnull().sum() + test_folds[i].isnull().sum()
value = missing_after[missing_after > 0]
if value.empty:
print(f"All fine! No missing values found in train and test for fold {i}")
else:
print(f"Missing values found in fold {i}:\n{value}")
create_directory(report_dir, "")
create_class_distribution_report(
train_and_test_dfs[0], os.path.join(report_dir, "class_distribution.pdf")
)
print("Processed folds")
print("Scaling folds...")
train_folds, test_folds = scale_folds(
train_folds,
test_folds,
)
print("Scaled folds")
train_folds = [df.to_numpy() for df in train_folds]
test_folds = [df.to_numpy() for df in test_folds]
# preprocessing time
preprocessing_time = get_process_time("Preprocessing", start_time)
# kNN hyperparameter grid: candidate voting and distance methods
voting_methods = ["majority", "inverse_distance", "sheppard"]
distance_methods = ["minkowski", "clark"]
# Abbreviation maps used when labelling results
voting_method_abbr = {
"majority": "MAJR",
"inverse_distance": "INVD",
"sheppard": "SHEP",
}
distance_method_abbr = {"minkowski": "MIN", "clark": "CLK"}
k_values = [1, 3, 5, 7] # number of neighbors
# Minkowski order: 1 = Manhattan, 2 = Euclidean, 3 = cubic.
# Applies to Minkowski only; Clark uses a fixed exponent of 2.
p_values = [1, 2, 3]
feature_weighting_methods = ["equal", "information_gain", "reliefF"]
knn_results = cross_validate_kNN_weighted_parallel(
train_folds,
test_folds,
distance_methods,
k_values,
p_values,
voting_methods,
feature_weighting_methods,
output_dir=output_dir,
)
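# Assumed shape of knn_results (inferred from the aggregation and CD-diagram
# calls below): per-configuration entries exposing keys such as "voting_method",
# "w_method", and "k"; see classifiers.knn for the exact schema.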
aggregate_results_by_x(knn_results, "voting_method", output_dir)
aggregate_results_by_x(knn_results, "w_method", output_dir)
# Voting-scheme summary tables and figures
voting_analysis = analyze_voting_schemes_knn(knn_results, report_dir)
# Weighting-method summary tables and figures
weighting_method_analysis = analyze_weighting_methods_knn(knn_results, report_dir)
# k-value summary tables and figures
K_values_analysis = analyze_k_values_knn(knn_results, report_dir)
knn_time = get_process_time("KNN", preprocessing_time)
# Create CD plots for Weighting methods
create_hyperparameter_cd_diagram(
knn_results,
method_key="w_method",
methods_list=["equal", "information_gain", "reliefF"],
method_name="Feature Weighting Methods",
report_dir=report_dir,
)
# Create CD plots for K values
create_hyperparameter_cd_diagram(
knn_results,
method_key="k", # This should match the key in your results dictionary
methods_list=["1", "3", "5", "7"], # Convert to strings
method_name="K Values",
report_dir=report_dir,
)
# Create CD plots for voting schemes
create_hyperparameter_cd_diagram(
knn_results,
method_key="voting_method", # This matches the key in your results dictionary
methods_list=["majority", "inverse_distance", "sheppard"],
method_name="Voting Schemes",
report_dir=report_dir,
)
# Top 3 kNN configurations via get_top_configurations
best_knn = get_top_configurations(knn_results, algorithm="KNN", n_top=3)
# Wilson threshold reduction
wilson_start_time = time.time()
wilsonth_results, wilsonth_reduced_folds_wilson = evaluate_reduced_wilsonth_knn(
train_folds,
test_folds,
distance_methods=["minkowski", "clark"],
k_values=[1, 3, 5, 7],
p_values=[1, 2, 3],
voting_methods=["majority", "inverse_distance", "sheppard"],
weighting_methods=feature_weighting_methods,
output_dir=output_dir,
thresholds=[0.4, 0.5, 0.6],
)
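# wilsonth_reduced_folds_wilson maps (k, threshold) pairs to reduced training
# folds; the pair from the best configuration is looked up below for the SVM runs.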
best_wilson = get_top_configurations(wilsonth_results, algorithm="WILSON", n_top=3)
wilson_total_time = time.time() - wilson_start_time
# GCNN reduction
gcnn_start_time = time.time()
GCNN_results, GCNN_reduced_folds = evaluate_reduced_gcnn_knn_parallel(
train_folds,
test_folds,
distance_methods=["minkowski", "clark"],
k_values=[1, 3, 5, 7],
p_values=[1, 2, 3],
voting_methods=["majority", "inverse_distance", "sheppard"],
weighting_methods=feature_weighting_methods,
output_dir=output_dir,
alpha_values=[0.85, 0.90, 0.95],
)
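# GCNN_reduced_folds is keyed by alpha; the best alpha is looked up below.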
best_GCNN = get_top_configurations(GCNN_results, algorithm="GCNN", n_top=3)
gcnn_total_time = time.time() - gcnn_start_time
# DROP3 reduction
drop3_start_time = time.time()
DROP3_results, DROP3_reduced_folds = evaluate_reduced_drop3_knn_parallel(
train_folds,
test_folds,
distance_methods=["minkowski", "clark"],
k_values=[1, 3, 5, 7],
p_values=[1, 2, 3],
voting_methods=["majority", "inverse_distance", "sheppard"],
weighting_methods=feature_weighting_methods,
output_dir=output_dir,
)
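# DROP3_reduced_folds is keyed by k; the best k is looked up below.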
best_DROP3 = get_top_configurations(DROP3_results, algorithm="DROP3", n_top=3)
drop3_total_time = time.time() - drop3_start_time
# Compare Reduction algorithms
reduction_analysis = analyze_reduction_techniques(
wilsonth_results, GCNN_results, DROP3_results, report_dir
)
# SVM part
SVM_result = cross_validate_svm(
train_folds,
test_folds,
output_dir,
c_values=[0.1, 1, 10, 100, 1000],
gamma_values=[1, 0.1, 0.01, 0.001, 0.0001],
kernels=["linear", "poly", "rbf", "sigmoid"],
)
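# Assumed shape of SVM_result (inferred from the calls below): entries exposing
# "kernel", "C", and "gamma" keys; see classifiers.svm for the exact schema.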
# CD plot for SVM kernels
create_hyperparameter_cd_diagram(
SVM_result,
method_key="kernel",
methods_list=["linear", "poly", "rbf", "sigmoid"],
method_name="SVM Kernels",
report_dir=report_dir,
)
# CD plot for C values
create_hyperparameter_cd_diagram(
SVM_result,
method_key="C",
methods_list=["0.1", "1", "10", "100", "1000"], # as strings
method_name="SVM C Values",
report_dir=report_dir,
)
# CD plot for gamma values
create_hyperparameter_cd_diagram(
SVM_result,
method_key="gamma",
methods_list=["1", "0.1", "0.01", "0.001", "0.0001"], # as strings
method_name="SVM Gamma Values",
report_dir=report_dir,
)
aggregate_results_by_x(SVM_result, "C", output_dir)
aggregate_results_by_x(SVM_result, "gamma", output_dir)
aggregate_results_by_x(SVM_result, "kernel", output_dir)
best_SVM = get_top_configurations(SVM_result, algorithm="SVM", n_top=3)
# Compare kernels using Friedman test
kernel_analysis = analyze_svm_kernels(SVM_result, report_dir)
# Compare C values using Friedman test
c_analysis = analyze_svm_c_values(SVM_result, report_dir)
# Compare gamma parameters using Friedman test
gamma_analysis = analyze_svm_gamma_values(SVM_result, report_dir)
# Run SVM on the training folds produced by the best Wilson-threshold configuration
best_wilson_config = best_wilson[0]["configuration"] # best of the top 3
k_best = best_wilson_config["k"]
threshold_best = best_wilson_config["threshold"]
best_reduced_folds = wilsonth_reduced_folds_wilson[(k_best, threshold_best)]
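# Only the training folds are reduced; the original test folds are reused so
# every model is evaluated on the same held-out data.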
SVM_result_Wilson = cross_validate_svm(
best_reduced_folds,
test_folds,
output_dir,
c_values=[0.1, 1, 10, 100, 1000],
gamma_values=[1, 0.1, 0.01, 0.001, 0.0001],
kernels=["linear", "poly", "rbf", "sigmoid"],
)
best_SVM_Wilson = get_top_configurations(
SVM_result_Wilson, algorithm="SVM", n_top=3
)
# Update each result with Wilson information using shorter parameter names
for result in best_SVM_Wilson:
result["configuration"].update(
{
"reduction_method": "Wilson",
"wil_k": k_best,
"wil_th": threshold_best,
"wil_p": best_wilson_config["p"],
"wil_v": best_wilson_config["V"],
"wil_d": best_wilson_config["D"],
"wil_w": best_wilson_config["W"],
}
)
# Update method name to reflect it's using Wilson reduction
result["method"] = "Wilson_" + result["method"]
# Get best GCNN configuration
best_GCNN_config = best_GCNN[0]["configuration"] # First configuration from top 3
k_best = best_GCNN_config["k"]
alpha_best = best_GCNN_config["alpha"]
best_reduced_folds = GCNN_reduced_folds[alpha_best] # Use alpha as the key
# Run SVM on GCNN-reduced data
SVM_result_GCNN = cross_validate_svm(
best_reduced_folds,
test_folds,
output_dir,
c_values=[0.1, 1, 10, 100, 1000],
gamma_values=[1, 0.1, 0.01, 0.001, 0.0001],
kernels=["linear", "poly", "rbf", "sigmoid"],
)
# Get best SVM configurations
best_SVM_GCNN = get_top_configurations(SVM_result_GCNN, algorithm="SVM", n_top=3)
# Extract GCNN parameters from best configuration
gcnn_k_best = best_GCNN_config["k"]
gcnn_p_best = best_GCNN_config["p"]
gcnn_voting_method_best = best_GCNN_config["V"]
gcnn_weighting_method_best = best_GCNN_config["W"]
gcnn_distance_method_best = best_GCNN_config["D"]
gcnn_alpha_best = best_GCNN_config["alpha"]
# Update each SVM result with GCNN information
for result in best_SVM_GCNN:
result["configuration"].update(
{
"reduction_method": "GCNN",
"gcnn_k": gcnn_k_best,
"gcnn_p": gcnn_p_best,
"gcnn_v": gcnn_voting_method_best,
"gcnn_d": gcnn_distance_method_best,
"gcnn_w": gcnn_weighting_method_best,
"gcnn_a": gcnn_alpha_best,
}
)
# Update method name to reflect it's using GCNN reduction
result["method"] = "GCNN_" + result["method"]
best_DROP3_config = best_DROP3[0]["configuration"]
k_best = best_DROP3_config["k"]
best_reduced_folds = DROP3_reduced_folds[k_best]
# Run SVM on DROP3-reduced data
SVM_result_DROP3 = cross_validate_svm(
best_reduced_folds,
test_folds,
output_dir,
c_values=[0.1, 1, 10, 100, 1000],
gamma_values=[1, 0.1, 0.01, 0.001, 0.0001],
kernels=["linear", "poly", "rbf", "sigmoid"],
)
# Get best SVM configurations
best_SVM_DROP3 = get_top_configurations(SVM_result_DROP3, algorithm="SVM", n_top=3)
# Extract DROP3 parameters from best configuration
drop3_k_best = best_DROP3_config["k"]
drop3_p_best = best_DROP3_config["p"]
drop3_voting_method_best = best_DROP3_config["V"]
drop3_weighting_method_best = best_DROP3_config["W"]
drop3_distance_method_best = best_DROP3_config["D"]
# Update each SVM result with DROP3 information
for result in best_SVM_DROP3:
result["configuration"].update(
{
"reduction_method": "DROP3",
"drop3_k": drop3_k_best,
"drop3_p": drop3_p_best,
"drop3_v": drop3_voting_method_best,
"drop3_d": drop3_distance_method_best,
"drop3_w": drop3_weighting_method_best,
}
)
# Update method name to reflect it's using DROP3 reduction
result["method"] = "DROP3_" + result["method"]
# Combine results
all_results = (
best_knn
+ best_wilson
+ best_SVM
+ best_GCNN
+ best_DROP3
+ best_SVM_Wilson
+ best_SVM_GCNN
+ best_SVM_DROP3
)
# Run the Friedman test on the combined results
friedman_results = run_friedman_test(all_results)
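# friedman_results is expected to contain at least "sorted_results"
# (consumed by create_performance_report below).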
knn_create_reduction_comparison_report(
best_knn,
best_wilson,
best_GCNN,
best_DROP3,
train_folds,
wilsonth_reduced_folds_wilson,
GCNN_reduced_folds,
DROP3_reduced_folds,
wilson_total_time,
gcnn_total_time,
drop3_total_time,
filename=os.path.join(report_dir, "reduction_comparison_report.pdf"),
)
svm_create_reduction_comparison_report(
best_SVM,
best_SVM_Wilson,
best_SVM_GCNN,
best_SVM_DROP3,
train_folds,
wilsonth_reduced_folds_wilson,
GCNN_reduced_folds,
DROP3_reduced_folds,
wilson_total_time,
gcnn_total_time,
drop3_total_time,
filename=os.path.join(report_dir, "reduction_comparison_report_svm.pdf"),
)
# Create the performance report with our PDF generator function.
create_performance_report(
friedman_results["sorted_results"],
filename=os.path.join(report_dir, "model_performance_report.pdf"),
)
# Create the Nemenyi summary table with our function.
create_nemenyi_report(
friedman_results,
filename=os.path.join(report_dir, "nemenyi_comparison_report.pdf"),
)