minas.py · 460 lines (380 loc) · 17.9 KB

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# from celluloid import Camera
from skmultiflow.core import BaseSKMObject, ClassifierMixin
from sklearn.cluster import KMeans
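
# Overview of the flow implemented below (derived from the code in this file):
#   1. Offline phase (`fit`/`offline`): k-means is run per known class to build labelled
#      MicroCluster summaries.
#   2. Online phase (`partial_fit`): each incoming point is classified by its closest
#      micro-cluster if that cluster encompasses it; otherwise it is stored as "unknown"
#      in a short-term memory.
#   3. Novelty detection (`novelty_detect`): once enough unknown points accumulate, they
#      are clustered; cohesive and representative candidate clusters become either
#      extensions of known classes or new novelty patterns.
#   4. Forgetting (`trigger_forget`): micro-clusters and unknown points not updated within
#      the last `window_size` samples are moved out of the active model.
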
class Minas(BaseSKMObject, ClassifierMixin):
    def __init__(self,
                 kini=3,
                 cluster_algorithm='kmeans',
                 random_state=0,
                 min_short_mem_trigger=10,
                 min_examples_cluster=10,
                 threshold_strategy=1,
                 threshold_factor=1.1,
                 window_size=100,
                 update_summary=False,
                 animation=False):
        super().__init__()
        self.kini = kini
        self.random_state = random_state

        accepted_algos = ['kmeans']  # TODO: define list of accepted algos
        if cluster_algorithm not in accepted_algos:
            raise ValueError('Available algorithms: {}'.format(', '.join(accepted_algos)))
        else:
            self.cluster_algorithm = cluster_algorithm

        self.microclusters = []  # list of microclusters
        self.before_offline_phase = True

        self.short_mem = []
        self.sleep_mem = []
        self.min_short_mem_trigger = min_short_mem_trigger
        self.min_examples_cluster = min_examples_cluster
        self.threshold_strategy = threshold_strategy
        self.threshold_factor = threshold_factor

        self.window_size = window_size
        self.update_summary = update_summary
        self.animation = animation

        self.sample_counter = 0  # to be used with window_size

        if self.animation:
            # TODO use Camera
            # self.fig = plt.figure()
            # self.camera = Camera(self.fig)
            self.animation_frame_num = 0

    def fit(self, X, y, classes=None, sample_weight=None):
        """fit means fitting in the OFFLINE phase"""
        self.microclusters = self.offline(X, y)
        self.before_offline_phase = False
        return self

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """partial_fit handles the ONLINE phase: classify each point, update the model
        with explained points and store unexplained ones for novelty detection."""
        self.sample_counter += 1
        if self.before_offline_phase:
            self.fit(X, y)
        else:
            y_preds, cluster_preds = self.predict(X, ret_cluster=True)
            timestamp = self.sample_counter

            # TODO: remove this ugly loop too
            for point_x, y_pred, cluster in zip(X, y_preds, cluster_preds):
                if y_pred != -1:  # the model can explain point_x
                    cluster.update_cluster(point_x, y_pred, timestamp, self.update_summary)
                else:  # the model cannot explain point_x
                    self.short_mem.append(ShortMemInstance(point_x, timestamp))
                    if len(self.short_mem) >= self.min_short_mem_trigger:
                        self.novelty_detect()

            if self.animation:
                self.plot_clusters()

            # forgetting mechanism
            if self.sample_counter % self.window_size == 0:
                self.trigger_forget()

        return self

    def predict(self, X, ret_cluster=False):
        """Predict the class label for each point in X (an array of points).
        Points the model cannot explain are labelled -1 (unknown)."""
        # TODO: remove this ugly loop
        pred_labels = []
        pred_clusters = []
        for point in X:
            # find closest centroid
            closest_cluster = min(self.microclusters,
                                  key=lambda cl: cl.distance_to_centroid(point))
            if closest_cluster.encompasses(point):  # classify in this cluster
                pred_labels.append(closest_cluster.label)
                pred_clusters.append(closest_cluster)
            else:  # classify as unknown
                pred_labels.append(-1)
                pred_clusters.append(None)
        if ret_cluster:
            return np.asarray(pred_labels), pred_clusters
        else:
            return np.asarray(pred_labels)

    def predict_proba(self, X):
        # TODO
        pass

    def offline(self, X_train, y_train):
        microclusters = []
        # in offline phase, consider all instances arriving at the same time in the microclusters:
        timestamp = len(X_train)
        if self.cluster_algorithm == 'kmeans':
            for y_class in np.unique(y_train):
                # subset with instances from each class
                X_class = X_train[y_train == y_class]

                class_cluster_clf = KMeans(n_clusters=self.kini,
                                           random_state=self.random_state)
                class_cluster_clf.fit(X_class)

                for class_cluster in np.unique(class_cluster_clf.labels_):
                    # get instances in cluster
                    cluster_instances = X_class[class_cluster_clf.labels_ == class_cluster]

                    microclusters.append(
                        MicroCluster(y_class, cluster_instances, timestamp)
                    )
        return microclusters

    def novelty_detect(self):
        """Cluster the unexplained examples in the short-term memory and decide, for each
        candidate micro-cluster, whether it is an extension of a known class or a novelty."""
        possible_clusters = []
        X = np.array([instance.point for instance in self.short_mem])
        if self.cluster_algorithm == 'kmeans':
            cluster_clf = KMeans(n_clusters=self.kini,
                                 random_state=self.random_state)
            cluster_clf.fit(X)
            for cluster_label in np.unique(cluster_clf.labels_):
                cluster_instances = X[cluster_clf.labels_ == cluster_label]
                possible_clusters.append(
                    MicroCluster(-1, cluster_instances, self.sample_counter))

        for cluster in possible_clusters:
            if cluster.is_cohesive(self.microclusters) and cluster.is_representative(self.min_examples_cluster):
                closest_cluster = cluster.find_closest_cluster(self.microclusters)
                closest_distance = cluster.distance_to_centroid(closest_cluster.centroid)

                threshold = self.best_threshold(cluster, closest_cluster,
                                                self.threshold_strategy, self.threshold_factor)

                # TODO make these ifs elifs cleaner
                if closest_distance < threshold:  # the new microcluster is an extension
                    cluster.label = closest_cluster.label
                elif self.sleep_mem:  # look in the sleep memory, if not empty
                    closest_cluster = cluster.find_closest_cluster(self.sleep_mem)
                    closest_distance = cluster.distance_to_centroid(closest_cluster.centroid)
                    if closest_distance < threshold:  # check again: the new microcluster is an extension
                        cluster.label = closest_cluster.label
                        # awake old cluster
                        self.sleep_mem.remove(closest_cluster)
                        closest_cluster.timestamp = self.sample_counter
                        self.microclusters.append(closest_cluster)
                    else:  # the new microcluster is a novelty pattern
                        cluster.label = max([cluster.label for cluster in self.microclusters]) + 1
                else:  # the new microcluster is a novelty pattern
                    cluster.label = max([cluster.label for cluster in self.microclusters]) + 1

                # add the new cluster to the model
                self.microclusters.append(cluster)
                # remove these examples from short term memory
                for instance in cluster.instances:
                    self.short_mem.remove(instance)

    def best_threshold(self, new_cluster, closest_cluster, strategy, factor):
        def run_strategy_1():
            factor_1 = factor
            # factor_1 = 5  # good for artificial, separated data sets
            return factor_1 * np.std(closest_cluster.distance_to_centroid(closest_cluster.instances))

        if strategy == 1:
            return run_strategy_1()
        else:
            # factor_2 = factor_3 = 1.2  # good for artificial, separated data sets
            factor_2 = factor_3 = factor
            clusters_same_class = self.get_clusters_in_class(closest_cluster.label)
            if len(clusters_same_class) == 1:
                return run_strategy_1()
            else:
                class_centroids = np.array([cluster.centroid for cluster in clusters_same_class])
                distances = closest_cluster.distance_to_centroid(class_centroids)
                if strategy == 2:
                    return factor_2 * np.max(distances)
                elif strategy == 3:
                    return factor_3 * np.mean(distances)

    def get_clusters_in_class(self, label):
        return [cluster for cluster in self.microclusters if cluster.label == label]

    def trigger_forget(self):
        # iterate over copies so removing items does not skip elements
        for cluster in list(self.microclusters):
            if cluster.timestamp < self.sample_counter - self.window_size:
                self.sleep_mem.append(cluster)
                self.microclusters.remove(cluster)
        for instance in list(self.short_mem):
            if instance.timestamp < self.sample_counter - self.window_size:
                self.short_mem.remove(instance)

    def plot_clusters(self):
        """Simplistic plotting, assumes elements in cluster have two dimensions"""
        points = pd.DataFrame(columns=['x', 'y', 'pred_label'])
        cluster_info = pd.DataFrame(columns=['label', 'centroid', 'radius'])
        for cluster in self.microclusters:
            cluster_info = cluster_info.append(pd.Series({'label': cluster.label,
                                                          'centroid': cluster.centroid,
                                                          'radius': cluster.radius}),
                                               ignore_index=True)
            # add points from cluster
            for point in cluster.instances:
                points = points.append(pd.Series({'x': point[0],
                                                  'y': point[1],
                                                  'pred_label': cluster.label}),
                                       ignore_index=True)
        # add points from short term memory
        for mem_instance in self.short_mem:
            points = points.append(pd.Series({'x': mem_instance.point[0],
                                              'y': mem_instance.point[1],
                                              'pred_label': -1}),
                                   ignore_index=True)

        color_names = ['k', 'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
                       'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
        assert len(cluster_info.label.unique()) <= len(color_names)  # limited to these colors for now
        # colormap to be indexed by label id
        colormap = pd.DataFrame({'name': color_names}, index=range(-1, len(color_names) - 1))

        mapped_label_colors = colormap.loc[points['pred_label']].values[:, 0]
        plt.scatter(points['x'], points['y'], c=mapped_label_colors, s=10, alpha=0.3)
        plt.gca().set_aspect('equal', adjustable='box')  # equal scale for both axes

        circles = []
        for label, centroid, radius in cluster_info.values:
            circles.append(plt.Circle((centroid[0], centroid[1]), radius,
                                      color=colormap.loc[label].values[0], alpha=0.1))
        for circle in circles:
            plt.gcf().gca().add_artist(circle)

        # self.camera.snap()
        import os
        if not os.path.exists('animation'):
            os.makedirs('animation')
        plt.savefig(f'animation/clusters_{self.animation_frame_num:05}.png', dpi=300)
        plt.close()
        self.animation_frame_num += 1

    def plot_animation(self):
        pass
        # TODO
        # animation = self.camera.animate()
        # animation.save('animation.mp4')

    def confusion_matrix(self, X_test=None, y_test=None):
        """Creates a confusion matrix.

        It must be run on a fitted classifier that has already seen the examples in the test set.

        Parameters
        ----------
        X_test : numpy.ndarray
            The set of data samples to predict the class labels for.
        y_test : numpy.ndarray
            The set of class labels for the data samples.

        Returns
        -------
        pandas.core.frame.DataFrame
        """
        # Init confusion matrix
        y_test_classes = np.unique(y_test)  # rows
        detected_classes = np.unique([cluster.label for cluster in self.microclusters])  # columns
        conf_matrix = pd.DataFrame(np.zeros((len(y_test_classes), len(detected_classes) + 1), dtype=np.int32))
        conf_matrix.index = y_test_classes
        conf_matrix.rename(columns={len(detected_classes): -1}, inplace=True)  # last column is for unknown class

        y_preds = self.predict(X_test)

        for y_true, y_pred in zip(y_test, y_preds):
            conf_matrix.loc[y_true, y_pred] += 1

        return conf_matrix
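

# Minimal usage sketch (hypothetical data and variable names, not part of the original
# file; it only illustrates the offline/online protocol implemented above):
#
#   from sklearn.datasets import make_blobs
#   X_off, y_off = make_blobs(n_samples=300, centers=3, random_state=0)
#   clf = Minas(kini=3, window_size=100)
#   clf.fit(X_off, y_off)                          # offline phase
#   X_stream, _ = make_blobs(n_samples=500, centers=4, random_state=1)
#   for i in range(len(X_stream)):
#       clf.partial_fit(X_stream[i:i + 1], None)   # online phase, one point at a time
#   print(clf.confusion_matrix(X_off, y_off))
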
class MicroCluster(object):
    def __init__(self,
                 label,  # the class the microcluster belongs to
                 instances,
                 timestamp=0):
        # TODO: remove instances so it doesn't need to be stored in memory
        super(MicroCluster, self).__init__()
        self.label = label
        self.instances = instances
        self.n = len(instances)
        self.linear_sum = instances.sum(axis=0)
        self.centroid = self.linear_sum / self.n
        # Sum of the squared l2 norms of all samples belonging to a microcluster:
        self.squared_sum = np.square(np.linalg.norm(self.instances, axis=1)).sum()
        # self.squared_sum = np.square(instances).sum(axis=0)  # From CluSTREAM paper
        self.timestamp = timestamp
        self.update_properties()

    def __str__(self):
        return f"""Target class {self.label}
# of instances: {self.n}
Linear sum: {self.linear_sum}
Squared sum: {self.squared_sum}
Centroid: {self.centroid}
Radius: {self.radius}
Timestamp of last change: {self.timestamp}"""

    def get_radius(self):
        """Return radius of the subcluster"""
        factor = 1.5
        # from BIRCH Wikipedia
        diff = (self.squared_sum / self.n) - np.dot(self.centroid, self.centroid)
        if diff > 1e-15:
            return factor * np.sqrt(diff)
        else:  # in this case diff should be zero, but sometimes it's an infinitesimal difference
            return 0
        # from MINAS paper:
        # return factor*np.std(self.distance_to_centroid(self.instances))
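
    # Worked example (hypothetical numbers, added for illustration): for a micro-cluster
    # holding the points [1, 1] and [3, 3]: n = 2, linear_sum = [4, 4], centroid = [2, 2],
    # squared_sum = (1 + 1) + (9 + 9) = 20, and get_radius() returns
    # 1.5 * sqrt(20 / 2 - dot([2, 2], [2, 2])) = 1.5 * sqrt(2) ≈ 2.12.
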
    def distance_to_centroid(self, X):
        """Returns distance from X to centroid of this cluster.

        Parameters
        ----------
        X : numpy.ndarray
        """
        if len(X.shape) == 1:  # X is only one point
            return np.linalg.norm(X - self.centroid)
        else:  # X contains several points
            return np.linalg.norm(X - self.centroid, axis=1)

    def encompasses(self, X):
        """Check if points in X are inside this microcluster.

        Parameters
        ----------
        X : numpy.ndarray

        Returns
        -------
        numpy.bool_"""
        return np.less(self.distance_to_centroid(X), self.radius)

    def find_closest_cluster(self, clusters):
        """Finds closest cluster to this one among passed clusters.

        Parameters
        ----------
        clusters : List[minas.MicroCluster]

        Returns
        -------
        minas.MicroCluster
        """
        return min(clusters, key=lambda cl: cl.distance_to_centroid(self.centroid))

    def update_cluster(self, X, y, timestamp, update_summary):
        """
        Parameters
        ----------
        self : minas.MicroCluster
        X : numpy.ndarray
            X is one point.
        y : numpy.int64

        Returns
        -------
        """
        assert len(X.shape) == 1  # it's just one point
        self.timestamp = timestamp
        self.instances = np.append(self.instances, [X],
                                   axis=0)  # TODO: remove later when dropping instances from class
        if update_summary:
            self.n += 1
            self.linear_sum = np.sum([self.linear_sum, X], axis=0)
            # keep squared_sum a scalar (sum of squared l2 norms), consistent with __init__
            self.squared_sum += np.dot(X, X)
            self.update_properties()

    def update_properties(self):
        """Update centroid and radius based on current cluster properties.

        Returns
        -------
        None
        """
        self.centroid = self.linear_sum / self.n
        self.radius = self.get_radius()

    def is_cohesive(self, clusters):
        """Verifies if this cluster is cohesive for novelty detection purposes.

        A new micro-cluster is cohesive if its silhouette coefficient is larger than 0.
        b represents the Euclidean distance between the centroid of the new micro-cluster and the centroid of its
        closest micro-cluster, and a represents the standard deviation of the distances between the examples of the
        new micro-cluster and the centroid of the new micro-cluster.

        Parameters
        ----------
        clusters : List[minas.MicroCluster]

        Returns
        -------
        """
        b = self.distance_to_centroid(self.find_closest_cluster(clusters).centroid)
        a = np.std(self.distance_to_centroid(self.instances))
        silhouette = (b - a) / max(a, b)  # hm, this is always positive if b > a
        return silhouette > 0

    def is_representative(self, min_examples):
        """Verifies if this cluster is representative for novelty detection purposes.

        A new micro-cluster is representative if it contains a minimal number of examples,
        where this number is a user-defined parameter.

        Parameters
        ----------
        min_examples : int

        Returns
        -------
        bool
        """
        return self.n >= min_examples

class ShortMemInstance:
    def __init__(self, point, timestamp):
        self.point = point
        self.timestamp = timestamp

    def __eq__(self, other):
        """
        I'm considering elements equal if they have the same values for all variables.
        This currently does not consider the timestamp.
        """
        if type(other) == np.ndarray:
            return np.all(self.point == other)