# svm.py
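"""Linear-SVM classification of healthy vs. swollen joint features.

Runs stratified k-fold cross-validation with SMOTE oversampling of the
training folds, prints averaged accuracy/F1/recall/precision, and saves
permutation-feature-importance and averaged-confusion-matrix plots for the
PIP and DIP feature sets.
"""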
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
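
# Defensive addition (not in the original script): make sure the output
# directory for plots exists before any plt.savefig call below; this assumes
# figures are meant to land under results/plots/ as in the paths used later.
import os

os.makedirs('results/plots', exist_ok=True)
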
def calculate_feature_importance(csv_file_healthy, csv_file_swollen, feature_prefix, n_splits=10):
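    """Cross-validate a linear SVM separating healthy from swollen samples.

    csv_file_healthy / csv_file_swollen: paths to the per-class feature CSVs,
    each with an 'Image' column followed by numeric feature columns.
    feature_prefix: label (e.g. 'PIP' or 'DIP') used in plot titles and
    output file names.
    n_splits: number of stratified cross-validation folds.
    """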
    # Load datasets
    df_swollen = pd.read_csv(csv_file_swollen)
    df_healthy = pd.read_csv(csv_file_healthy)

    # Drop the 'Image' column to retain only numerical values
    df_swollen.drop('Image', axis=1, inplace=True)
    df_healthy.drop('Image', axis=1, inplace=True)

    # Add a label column to each dataset
    df_swollen['label'] = 1  # Class 1 (Swollen)
    df_healthy['label'] = 0  # Class 0 (Healthy)

    # Combine the datasets
    df_combined = pd.concat([df_swollen, df_healthy])

    # Separate features and labels
    X = df_combined.iloc[:, :-1]  # Features
    y = df_combined['label']      # Labels

    # Create a StratifiedKFold object to maintain the proportion of each class
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Create an SVM classifier
    clf = svm.SVC(kernel='linear', class_weight='balanced', random_state=42)

    # Lists to store per-fold metrics
    accuracy_scores = []
    f1_scores = []
    recall_scores = []
    precision_scores = []
    confusion_matrices = []

    # Array accumulating permutation importances across folds
    feature_importances = np.zeros(X.shape[1])
    # Perform k-fold cross-validation and calculate metrics
    for train_index, test_index in skf.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # Oversample with SMOTE on the training fold only, so no synthetic
        # samples leak into the held-out test fold
        smote = SMOTE(random_state=42)
        X_train_fold_resampled, y_train_fold_resampled = smote.fit_resample(X_train_fold, y_train_fold)

        # Fit on the resampled training fold, predict on the untouched test fold
        clf.fit(X_train_fold_resampled, y_train_fold_resampled)
        y_pred = clf.predict(X_test_fold)

        # Metrics calculation
        report = classification_report(y_test_fold, y_pred, output_dict=True, zero_division=0)
        accuracy_scores.append(accuracy_score(y_test_fold, y_pred))
        f1_scores.append(report['weighted avg']['f1-score'])
        recall_scores.append(report['weighted avg']['recall'])
        precision_scores.append(report['weighted avg']['precision'])
        confusion_matrices.append(confusion_matrix(y_test_fold, y_pred))

        # Permutation importance on the test fold: mean drop in score when a
        # feature's values are shuffled
        perm_importance = permutation_importance(clf, X_test_fold, y_test_fold, n_repeats=10, random_state=42)

        # Accumulate the feature importances
        feature_importances += perm_importance.importances_mean

    # Average the feature importances over all folds
    feature_importances /= n_splits
    # Sort the features by importance
    sorted_indices = np.argsort(feature_importances)[::-1]

    # Plot the feature importances
    plt.figure(figsize=(12, 8))
    plt.title(f"Permutation Feature Importance with KFold for {feature_prefix}")
    plt.bar(range(len(feature_importances)), feature_importances[sorted_indices], align="center")
    plt.xticks(range(len(feature_importances)), X.columns[sorted_indices], rotation=90)
    plt.ylabel("Mean accuracy decrease across folds")
    plt.tight_layout()
    plt.savefig(f'results/plots/{feature_prefix}_kfold_feature_importance.png')
    plt.close()  # release the figure so repeated calls don't accumulate open figures

    # Print the sorted features and their importance
    print(f"Feature importances with KFold for {feature_prefix}:")
    for idx in sorted_indices:
        print(f"{X.columns[idx]}: {feature_importances[idx]:.4f}")
    # Average metrics over all folds
    mean_accuracy = np.mean(accuracy_scores)
    mean_f1_score = np.mean(f1_scores)
    mean_recall = np.mean(recall_scores)
    mean_precision = np.mean(precision_scores)

    print(f"Mean Accuracy: {mean_accuracy:.2f}")
    print(f"Mean F1 Score: {mean_f1_score:.2f}")
    print(f"Mean Recall: {mean_recall:.2f}")
    print(f"Mean Precision: {mean_precision:.2f}")
    # Average the per-fold confusion matrices element-wise
    mean_confusion_matrix = np.mean(confusion_matrices, axis=0)

    # Plot the averaged confusion matrix using seaborn
    plt.figure(figsize=(8, 6))
    sns.heatmap(mean_confusion_matrix, annot=True, fmt='g', cmap='Blues',
                xticklabels=['Healthy', 'Swollen'], yticklabels=['Healthy', 'Swollen'])
    plt.title('Average Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(f'results/plots/{feature_prefix}_average_confusion_matrix.png')
    plt.close()

if __name__ == '__main__':
    # Call the function for PIP features
    calculate_feature_importance('results/features_pip_healthy.csv', 'results/features_pip_swollen.csv', 'PIP')

    # Call the function for DIP features
    calculate_feature_importance('results/features_dip_healthy.csv', 'results/features_dip_swollen.csv', 'DIP')