-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparzen_windows_likelyhood(use KDtree and hypercube kernel).py
60 lines (50 loc) · 2.31 KB
/
parzen_windows_likelyhood(use KDtree and hypercube kernel).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
from sklearn.neighbors import KDTree
data = np.loadtxt('Diabetes.txt', delimiter=',')
accuracys = []
#Define parzen windows estimation function with hypercube kernel
def parzen_windows_likelyhood(train_data, h, observation, tree):
dimensions = len(observation)
#since |u| = |(x-x_i)/h| < 1/2 when window function = 1, so |x - x_i| < 1/2 * h
inside_index = tree.query_radius(observation, r= h/2)
k = len(inside_index[0])
return (float(k) / len(train_data)) / float(h**dimensions)
#use random training and testing datasets to run ten times
for trial in range(10):
#Re-order the dataset
data = np.random.permutation(data)
#Split train and test data sets
train_data = data[0:len(data)/2,:]
test_data = data[len(data)/2:,:]
#select features
active_feat = [1, 2, 3]
#calcuate the prior for each feature
prior1temp = float(len(train_data[train_data[:,8]==0, :]))
prior2temp = float(len(train_data[train_data[:,8]==1, :]))
prior1 = prior1temp/(prior1temp + prior2temp)
prior2 = prior2temp/(prior1temp + prior2temp)
#just select train dataset with features with class label
train_data_features1 = train_data[train_data[:,8]==0, :][:, active_feat]
train_data_features2 = train_data[train_data[:,8]==1, :][:, active_feat]
#select test dataset with features varibales
test_data_features = test_data[:, active_feat]
#train KDtree
#the defsult metric of distance is chebyshev distance which is |x - x_i|
tree1 = KDTree(train_data_features1, leaf_size=40, metric='chebyshev')
tree2 = KDTree(train_data_features2, leaf_size=40, metric='chebyshev')
correct = 0
wrong = 0
for i in range(len(test_data)):
post1 = parzen_windows_likelyhood(train_data_features1, 20, test_data_features[i], tree1) * prior1
post2 = parzen_windows_likelyhood(train_data_features2, 20, test_data_features[i], tree2) * prior2
if post1 > post2 and test_data[i, 8] == 0:
correct += 1
elif post1 < post2 and test_data[i, 8] == 1:
correct += 1
else:
wrong += 1
accuracy = float(correct) / len(test_data)
accuracys.append(accuracy)
print trial+1, accuracy
print 'Average accuracy:', np.mean(accuracys)
print 'std for accuracy:', np.std(accuracys)