-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknn.py
210 lines (160 loc) · 4.69 KB
/
knn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Load the wine dataset bundled with scikit-learn.
wine = datasets.load_wine()
# The dataset has 13 features; only the subset named below is used.
selected_features = ['alcohol', 'flavanoids', 'color_intensity', 'ash']
df_wine = pd.DataFrame(data=wine.data, columns=wine.feature_names)
# Target vector y and feature matrix X, both as numpy arrays.
y = wine.target
X = df_wine.loc[:, selected_features].values
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Split data into train & test sets *before* scaling, so that the held-out
# rows play no part in estimating the scaling statistics (avoids leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Preprocess data: learn mean / standard deviation from the training split only
scaler = StandardScaler().fit(X_train)
# Apply the same training-derived transform to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Keep X standardised as before, but using the training statistics
X = scaler.transform(X)
import numpy as np
from collections import Counter
class KNN:
    """
    K-nearest neighbour classifier

    Each sample is assigned the most common label among the
    ``num_neighbors`` training samples closest to it.

    Attributes:
    - num_neighbors (int): Number of nearest training samples that vote.
    - distance (str): Distance metric, 'euclidean' or 'manhattan'.
    - X_train (array): Training samples, set by fit().
    - y_train (array): Training labels, set by fit().

    Methods:
    - fit(X, y) : Store the training data.
    - euclidean(v1, v2) : Euclidean distance between two vectors.
    - manhattan(v1, v2) : Manhattan distance between two vectors.
    - neighbors(x) : Labels of the nearest training samples to x.
    - predict(X) : Predict a label for every sample in X.
    """

    def __init__(self, num_neighbors=3, distance='euclidean'):
        """Inits KNN

        Args:
            num_neighbors: k-number of neighbours
            distance: distance metric used, 'euclidean' or 'manhattan'

        Returns
        ------
        - None
        """
        # Both settings are validated when they are used (see neighbors),
        # so changing the attributes after construction is checked as well.
        self.num_neighbors = num_neighbors
        self.distance = distance

    def fit(self, X, y):
        """
        Stores the training set; no computation happens until prediction

        Params
        ----------
        - X : array-like, one row per training sample
        - y : array-like, one label per row of X

        Returns
        --------
        - None

        Raises
        ------
        - ValueError : if X and y do not have the same number of entries
        """
        # Convert to arrays so that row iteration and positional indexing
        # behave the same for lists, DataFrames and Series alike.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(
                    len(X), len(y)))
        self.X_train = X
        self.y_train = y

    def euclidean(self, v1, v2):
        """
        Calculates euclidean distance between two vector points

        Params
        ------
        - v1 : array-like, first vector
        - v2 : array-like, second vector (same length as v1)

        Returns
        ------
        - float : distance
        """
        # Square root of the summed squared coordinate differences
        diff = np.asarray(v1) - np.asarray(v2)
        return np.sqrt(np.sum(diff ** 2))

    def manhattan(self, v1, v2):
        """
        Calculates manhattan distance between two vector points

        Params
        ------
        - v1 : array-like, first vector
        - v2 : array-like, second vector (same length as v1)

        Returns
        ------
        - float : distance
        """
        # Sum of the absolute coordinate differences
        diff = np.asarray(v1) - np.asarray(v2)
        return np.sum(np.abs(diff))

    def _metric(self):
        """Returns the distance method selected by self.distance."""
        if self.distance == 'euclidean':
            return self.euclidean
        if self.distance == 'manhattan':
            return self.manhattan
        # Without this check an unknown metric produced an empty distance
        # list and a confusing IndexError later on in predict().
        raise ValueError(
            "Unknown distance metric {!r}; expected 'euclidean' or "
            "'manhattan'".format(self.distance))

    def neighbors(self, x):
        """
        Finds the labels of the training samples nearest to x

        Params
        ------
        - x : array-like, a single sample

        Returns
        ------
        - list : labels of the num_neighbors closest training samples,
                 ordered from nearest to farthest

        Raises
        ------
        - ValueError : if the distance metric is unknown or num_neighbors
                       is smaller than 1
        """
        if self.num_neighbors < 1:
            # A zero or negative k would slice the sorted indices wrongly
            raise ValueError(
                "num_neighbors must be at least 1, got {}".format(
                    self.num_neighbors))
        # Pick the metric once instead of re-checking it for every row
        metric = self._metric()
        distances = [metric(x, x_train) for x_train in self.X_train]
        # Indices of the training samples, nearest first
        nearest = np.argsort(distances)[:self.num_neighbors]
        return [self.y_train[i] for i in nearest]

    def predict(self, X):
        """
        Predicts the class each sample belongs to

        Params
        ---------
        - X : array-like, one row per sample to classify

        Returns
        ---------
        - array : predictions, one label per row of X
        """
        predictions = []
        for x in X:
            # Majority vote among the nearest neighbours; on a tie Counter
            # keeps the label that was encountered first (the nearer one).
            votes = Counter(self.neighbors(x))
            predictions.append(votes.most_common(1)[0][0])
        return np.array(predictions)
def accuracy(x, y):
    """
    Calculates accuracy as the fraction of positions where x and y agree

    Params
    ------
    - x : array, e.g. predicted labels
    - y : array, e.g. true labels (same length as x)

    Returns
    ------
    - float : accuracy, between 0.0 and 1.0

    Raises
    ------
    - ValueError : if x and y differ in length or are empty
    """
    total = len(x)
    if total != len(y):
        # A silent mismatch would either crash mid-loop or ignore the
        # surplus entries and report a misleading score.
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(
                total, len(y)))
    if total == 0:
        # Accuracy is undefined without samples; fail with a clear message
        # instead of a bare division-by-zero error.
        raise ValueError("accuracy is undefined for empty input")
    # Count the positions where both sequences hold the same value
    matches = sum(1 for a, b in zip(x, y) if a == b)
    # Number of matches over the number of samples
    return matches / total