# DecisionTree.py
import pandas as pd
import numpy as np
import time
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
# Here we have 2 fixed datasets that remain constant during the whole training/testing process.
# Each dataset contains 5,000 stratified samples from the original dataset of Wednesday's traffic.
# As we are targeting BENIGN traffic, BENIGN samples have label 1 and attacks have label 0.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
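# A hypothetical sketch of how such stratified 5,000-sample splits could be produced;
# the source file name "wednesday.csv" and the raw string label 'BENIGN' are assumptions,
# not part of this script:
#
# from sklearn.model_selection import train_test_split
# full = pd.read_csv("wednesday.csv")
# full['Label'] = (full['Label'] == 'BENIGN').astype(int)  # BENIGN -> 1, attack -> 0
# train, test = train_test_split(full, train_size=5000, test_size=5000,
#                                stratify=full['Label'], random_state=42)
# train.to_csv("train.csv", index=False)
# test.to_csv("test.csv", index=False)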
# Extract the labels (BENIGN = 1, attack = 0) into numpy arrays for train and test
y_train = train['Label'].to_numpy()
y_test = test['Label'].to_numpy()
# Assign the remaining columns as features, converting their values to float32 numpy arrays
# (scikit-learn uses 32-bit precision internally) and replacing NaN/inf values via nan_to_num
X_train = train[train.columns[0:-1]].astype(dtype=np.float32).to_numpy()
X_train = np.nan_to_num(X_train)
X_test = test[test.columns[0:-1]].astype(dtype=np.float32).to_numpy()
X_test = np.nan_to_num(X_test)
# Prepare lists to store the metrics (accuracy, recall, precision, F1-score)
# and the running time of each run
accuracies = []
recalls = []
precisions = []
f1_scores = []
running_time = []
# Collect the metric lists in a dict so the results of running the tree several times
# can be saved to a dataframe and compared later
df = {'Running time': running_time, 'Accuracy': accuracies, 'Recall': recalls,
'Precision': precisions, 'F1-score': f1_scores}
# To get average metrics (accuracy, recall, etc.), we run the classifier several times
# and then calculate the averages of the results we obtain
for i in range(20):
    tree = DecisionTreeClassifier()
    start_time = time.time()
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    end_time = time.time()
    accuracies.append(accuracy_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    running_time.append(end_time - start_time)
print("Mean accuracy: " + str(np.mean(accuracies)))
print("Mean precision: " + str(np.mean(precisions)))
print("Mean recalls: " + str(np.mean(recalls)))
print("Mean F1-scores: " + str(np.mean(f1_scores)))
print("Mean running time:" + str(np.mean(running_time)))
# Save our statistics to a dataframe (useful if we later need to export them to a more
# readable format, such as an Excel or CSV file)
results = pd.DataFrame(df)
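# A minimal sketch of the export step mentioned above; the output file names are
# illustrative choices, not part of the original script (to_excel additionally
# requires an engine such as openpyxl to be installed):
# results.to_csv("decision_tree_results.csv", index=False)
# results.to_excel("decision_tree_results.xlsx", index=False)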
# As we can see, the decision tree is, in fact, the best classifier for this problem: it has
# the fastest running time and the best metric values (accuracy, precision, recall and F1-score),
# and it doesn't require any preprocessing of the data