-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEstimatorInterface.py
226 lines (212 loc) · 9.87 KB
/
EstimatorInterface.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
"""
Estimator interface for GP implementations.
This interface defines all the hyperparameters for Brush estimators and
provides documentation for the hyperparameters.
"""
import numpy as np
from pybrush import Parameters
class EstimatorInterface():
    """
    Interface class for all estimators in pybrush.

    Defines and documents every hyperparameter shared by Brush estimators,
    and provides `_wrap_parameters` to translate them into the c++ backend's
    `Parameters` object.

    Parameters
    ----------
    mode : str, default 'classification'
        The mode of the estimator. Used by subclasses.
    pop_size : int, default 100
        Population size.
    max_gens : int, default 100
        Maximum iterations of the algorithm.
    max_time : int, optional (default: -1)
        Maximum time termination criterion in seconds. If -1, not used.
    max_stall : int, optional (default: 0)
        How many generations to continue after the validation loss has
        stalled. If 0, not used.
    verbosity : int, default 0
        Controls level of printouts.
    max_depth : int, default 3
        Maximum depth of GP trees in the GP program. Use 0 for no limit.
    max_size : int, default 20
        Maximum number of nodes in a tree. Use 0 for no limit.
    num_islands : int, default 1
        Number of independent islands to use in evolutionary framework.
    n_jobs : int, default 1
        Number of parallel threads to use in the c++ engine.
    mig_prob : float, default 0.05
        Probability of occurring a migration between two random islands at
        the end of a generation, must be between 0 and 1.
    cx_prob : float, default 1/7
        Probability of applying the crossover variation when generating the
        offspring, must be between 0 and 1.
        Given that there are `n` mutations, and either crossover or mutation
        is used to generate each individual in the offspring (but not both
        at the same time), we want to have by default a uniform probability
        between crossover and every possible mutation. By setting
        `cx_prob=1/(n+1)`, and `1/n` for each mutation, we can achieve a
        uniform distribution.
    mutation_probs : dict, default {"point":1/6, "insert":1/6, "delete":1/6, "subtree":1/6, "toggle_weight_on":1/6, "toggle_weight_off":1/6}
        A dictionary with keys naming the types of mutation and floating
        point values specifying the fraction of total mutations to do with
        that method. The probability of having a mutation is `(1-cx_prob)`
        and, in case the mutation is applied, then each mutation option is
        sampled based on the probabilities defined in `mutation_probs`. The
        set of probabilities should add up to 1.0.
    functions : dict[str,float] or list[str], default {}
        A dictionary with keys naming the function set and values giving the
        probability of sampling them, or a list of functions which will be
        weighted uniformly. If empty, all available functions are included
        in the search space.
    initialization : {"uniform", "max_size"}, default "uniform"
        Distribution of sizes on the initial population. If `max_size`, then
        every expression is created with `max_size` nodes. If `uniform`,
        size will be uniformly distributed between 1 and `max_size`.
    objectives : list[str], default ["error", "size"]
        List with one or more objectives to use. Options are
        `"error", "size", "complexity"`. If `"error"` is used, then it will
        be the mean squared error for regression, and accuracy for
        classification.
    algorithm : {"nsga2island", "nsga2", "gaisland", "ga"}, default "nsga2"
        Which Evolutionary Algorithm framework to use to evolve the
        population. This is used only in DeapEstimators.
    weights_init : bool, default True
        Whether the search space should initialize the sampling weights of
        terminal nodes based on the correlation with the output y. If
        `False`, then all terminal nodes will have the same probability of
        1.0.
    validation_size : float, default 0.0
        Percentage of samples to use as a hold-out partition. These samples
        are used to calculate statistics during evolution, but not used to
        train the models. The `best_estimator_` will be selected using this
        partition. If zero, then the same data used for training is used for
        validation.
    val_from_arch : bool, optional (default: True)
        Validates the final model using the archive rather than the whole
        population.
    use_arch : bool, optional (default: False)
        Determines if we should save pareto front of the entire evolution
        (when set to True) or just the final population (False).
    batch_size : float, default 1.0
        Percentage of training data to sample every generation. If `1.0`,
        then all data is used. Very small values can improve execution time,
        but also lead to underfit.
    save_population : str, optional (default "")
        String containing the path to save the final population. Ignored if
        not provided.
    load_population : str, optional (default "")
        String containing the path to load the initial population. Ignored
        if not provided.
    shuffle_split : bool, optional (default False)
        Whether the engine should shuffle the data before splitting it into
        train and validation partitions. Ignored if `validation_size` is set
        to zero.
    logfile : str, optional (default: "")
        If specified, spits statistics into a logfile. "" means don't log.
    random_state : int, numpy.random.Generator, or None, default None
        If int, then the value is used to seed the c++ random generator; if
        a numpy `Generator`, a seed is drawn from it; if None, then a seed
        will be generated using a non-deterministic generator. It is
        important to notice that, even if the random state is fixed, it is
        unlikely that running brush using multiple threads will have the
        same results. This happens because the Operating System's scheduler
        is responsible to choose which thread will run at any given time,
        thus reproducibility is not guaranteed.
    """

    # Default mutation distribution: uniform over the six mutation types.
    _DEFAULT_MUTATION_PROBS = {
        "point": 1/6, "insert": 1/6, "delete": 1/6, "subtree": 1/6,
        "toggle_weight_on": 1/6, "toggle_weight_off": 1/6,
    }

    def __init__(self,
                 mode='classification',
                 pop_size=100,
                 max_gens=100,
                 max_time=-1,
                 max_stall=0,
                 verbosity=0,
                 max_depth=3,
                 max_size=20,
                 num_islands=1,
                 n_jobs=1,
                 mig_prob=0.05,
                 cx_prob=1/7,
                 mutation_probs=None,
                 functions: "list[str]|dict[str,float]|None" = None,
                 initialization="uniform",
                 algorithm="nsga2",
                 objectives=None,
                 random_state=None,
                 logfile="",
                 save_population="",
                 load_population="",
                 shuffle_split=False,
                 weights_init=True,
                 val_from_arch=True,
                 use_arch=False,
                 validation_size: float = 0.0,
                 batch_size: float = 1.0
                 ):
        self.pop_size = pop_size
        self.max_gens = max_gens
        self.max_stall = max_stall
        self.max_time = max_time
        self.verbosity = verbosity
        self.algorithm = algorithm
        self.mode = mode
        self.max_depth = max_depth
        self.max_size = max_size
        self.num_islands = num_islands
        self.mig_prob = mig_prob
        self.n_jobs = n_jobs
        self.cx_prob = cx_prob
        self.logfile = logfile
        self.save_population = save_population
        self.load_population = load_population
        # Avoid the shared-mutable-default pitfall: materialize fresh
        # containers per instance when the caller did not pass one.
        self.mutation_probs = (dict(self._DEFAULT_MUTATION_PROBS)
                               if mutation_probs is None else mutation_probs)
        self.val_from_arch = val_from_arch  # TODO: val from arch implementation (in cpp side)
        self.use_arch = use_arch
        self.functions = {} if functions is None else functions
        self.objectives = (["error", "size"]
                           if objectives is None else objectives)
        self.shuffle_split = shuffle_split
        self.initialization = initialization
        self.random_state = random_state
        self.batch_size = batch_size
        self.weights_init = weights_init
        self.validation_size = validation_size

    def _wrap_parameters(self, **extra_kwargs):
        """
        Creates a `Parameters` class to send to c++ backend the settings for
        the algorithm to use.

        Requires `self.n_classes_` and `self.feature_names_` to have been
        set (done by subclasses during fit). Extra keyword arguments are
        set as attributes on the resulting `Parameters` object.
        """
        # Lists of functions are converted to a uniformly weighted dict.
        if isinstance(self.functions, list):
            self.functions_ = {k: 1.0 for k in self.functions}
        else:
            self.functions_ = self.functions

        params = Parameters()

        params.classification = self.mode == "classification"
        params.n_classes = self.n_classes_
        params.verbosity = self.verbosity
        params.n_jobs = self.n_jobs
        params.pop_size = self.pop_size
        params.max_gens = self.max_gens
        params.logfile = self.logfile
        params.save_population = self.save_population
        params.load_population = self.load_population
        params.max_stall = self.max_stall
        params.max_time = self.max_time
        params.num_islands = self.num_islands
        params.max_depth = self.max_depth
        params.max_size = self.max_size
        params.objectives = self.objectives
        params.shuffle_split = self.shuffle_split
        params.cx_prob = self.cx_prob
        params.use_arch = self.use_arch
        params.val_from_arch = self.val_from_arch
        params.mig_prob = self.mig_prob
        params.functions = self.functions_
        params.mutation_probs = self.mutation_probs
        params.validation_size = self.validation_size
        params.batch_size = self.batch_size
        params.feature_names = self.feature_names_

        # Scorer defaults to MSE (regression); classification uses log loss.
        params.scorer_ = "mse"
        if self.mode == "classification":
            params.scorer_ = "log" if self.n_classes_ == 2 else "multi_log"

        if self.random_state is not None:
            seed = 0
            if isinstance(self.random_state, np.random.Generator):
                seed = self.random_state.integers(1_000_000)
            elif isinstance(self.random_state, int):
                seed = self.random_state
            else:
                raise ValueError("random_state must be either a numpy random generator or an integer")
            params.random_state = seed

        for k, v in extra_kwargs.items():
            setattr(params, k, v)

        return params