-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseed_spreader.py
169 lines (144 loc) · 5.79 KB
/
seed_spreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import PCG64
main_generator = np.random.Generator(PCG64(0))
# set seed for data generator
def set_seed(i):
global main_generator
main_generator = np.random.Generator(PCG64(i))
# Seed Spreader as described in DBSCAN Revisited
# Junhao Gan and Yufei Tao. "DBSCAN revisited: Mis-claim, un-fixability, and approximation."
# Proceedings of the 2015 ACM SIGMOD international conference on management of data. 2015.
# Junhao Gan and Yufei Tao. "On the Hardness and Approximation of Euclidean DBSCAN",
# ACM Transactions on Database Systems (TODS), vol. 42, no. 3, pp. 1–45, 2017
# obtain n uniformly sampled points within a d-sphere with a fixed radius around a given point. Assigns all points to given cluster
# code partially based on code provided here http://extremelearning.com.au/how-to-generate-uniformly-random-points-on-n-spheres-and-n-balls/
def random_ball_num(center, radius, d, n, clunum):
d = int(d)
n = int(n)
u = main_generator.normal(0, 1, (n, d + 1)) # an array of d normally distributed random variables
norm = np.sqrt(np.sum(u[:,:-1] ** 2, 1))
r = main_generator.random(n) ** (1.0 / d)
normed = np.divide(u, norm[:, None])
x = r[:, None] * normed
x[:, :-1] = center + x[:, :-1] * radius
x[:, -1] = clunum
return x
def random_ball_num_bias(center, radius, d, n, clunum):
d = int(d)
n = int(n)
u = main_generator.normal(0, 1, (n, d + 1)) # an array of d normally distributed random variables
norm = np.sqrt(np.sum(u ** 2, 1))
r = main_generator.random(n) ** (1.0 / d)
normed = np.divide(u, norm[:, None])
x = r[:, None] * normed
x[:, :-1] = center + x[:, :-1] * radius
x[:, -1] = clunum
return x
def seedSpreader(n = 2000000, dim=5, ratio_noise=0.001, domain_size=100000, reset_counter= 100, restart_chance_mult = 10,
radius=100, seed=0, verbose =False, noise_adapt=False, var_density = False, step = False, gen = "uniform"):
"""
Seed Spreader generator function
:param n: number of data points
:param dim: dimensionality
:param ratio_noise: noise ratio
:param domain_size: domain size
:param reset_counter: counter for hypersphere points
:param restart_chance_mult: cluster restart overall
:param radius: radius of hypersphere
:param var_density: density based mode introduced in “On the hardness and approximation of euclidean dbscan,”, uses ((i mod 10) + 1) as factor
:param seed: seed
:param verbose: verbose
:param noise_adapt: adapt noise to altered domain size after random walk
:param step: overwrite step, otherwise radius/(2*dim)
:param gen: use different distribution for data point placement (only supports 'paper' (for paper version) aside from default uniform)
:return: data points as np.array of shape (dim+1, data_num), last column is cluster id
"""
if not step:
step = radius/2*dim
if verbose:
print("dim: ", dim)
print("n: ", n)
print("reset_counter", reset_counter)
print("base radius: ", radius)
print("base step:", step)
set_seed(seed)
ratio_noise = ratio_noise
reset_counter = reset_counter
num_data = round((n* (1-ratio_noise)))
restart_chance = restart_chance_mult/num_data
verbose = verbose
points = []
pos = main_generator.random(dim) * domain_size
counter = reset_counter
cluid = 0
while (len(points) < num_data):
if var_density:
factor = (cluid % 10) + 1
else:
factor = 1
rand = main_generator.random()
if rand < restart_chance:
if(verbose):
print("restart occured")
pos = main_generator.random(dim) * domain_size
counter = reset_counter
if len(points) > 0:
cluid += 1
if gen == "paper":
point = random_ball_num(pos, radius* factor, dim, 1, cluid)
else:
point = random_ball_num_bias(pos, radius* factor, dim, 1, cluid)
points.append(point[0])
counter -= 1
if counter == 0:
step_dir = (main_generator.random(dim) - 0.5)
step_dir = step_dir / np.linalg.norm(step_dir)
#print(pos)
pos = pos + (step_dir * step * factor)
counter = reset_counter
#print(pos)
if verbose:
print("step occured")
points_np = np.array(points)
mins = []
maxs = []
maxall = -1 * np.inf
minall = np.inf
for d in range(dim):
maxs.append(np.max(points_np[:,d]))
mins.append(np.min(points_np[:, d]))
if maxall < maxs[d]:
maxall = maxs[d]
if minall > mins[d]:
minall = mins[d]
noise = main_generator.random([n-num_data, dim + 1])
if noise_adapt=="square":
dspan = maxall - minall
for d in range(dim):
noise[:, d] = (noise[:, d] * dspan * 1.2) + minall - 0.1 * dspan
elif noise_adapt==True or noise_adapt=="dim":
for d in range(dim):
dspan = maxs[d] - mins[d]
noise[:, d] = (noise[:, d] * dspan * 1.2) + mins[d] - 0.1 * dspan
else:
noise = noise*domain_size
noise[:, -1] = -1
points.extend(noise)
return np.array(points)
if __name__ == '__main__':
dim = 2
noise_ratio = 0.001
data = seedSpreader(dim=dim, ratio_noise=noise_ratio, noise_adapt=True, var_density = True)
print(data)
datax = data[:, 0:-1]
print(datax.shape)
datay = data[:, -1]
print(datay.shape)
plt.figure(figsize=(15, 15))
color = plt.cm.tab20(np.linspace(0, 1, len(np.unique(datay))))
color = np.append(color, [[0, 0, 0, 1]], axis=0)
#print(color)
plt.scatter(datax[:, 0], datax[:, 1], c=color[datay.astype('int32')])
plt.axis('scaled')
plt.show()