-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBrushEstimator.py
298 lines (220 loc) · 9.69 KB
/
BrushEstimator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""
sklearn-compatible wrapper for GP analyses.
See engine.cpp for Python (via pybind11) modules that give more fine-grained
control of the underlying GP objects.
"""
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, \
RegressorMixin, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from pybrush import Parameters, Dataset, SearchSpace, brush_rng
from pybrush.EstimatorInterface import EstimatorInterface
from pybrush import RegressorEngine, ClassifierEngine, MultiClassifierEngine
class BrushEstimator(EstimatorInterface, BaseEstimator):
    """
    Base class for Brush estimators that run on the c++ engine.

    Parameters are defined and documented in
    :py:class:`EstimatorInterface <pybrush.EstimatorInterface.EstimatorInterface>`

    Attributes
    ----------
    best_estimator_ : object
        The engine's best individual after training (``engine_.best_ind``).
        Its ``program`` attribute is used in subsequent calls to
        :func:`predict`.
    archive_ : list
        The archive returned by ``engine_.get_archive()`` after training.
        :func:`predict_archive` reads the ``'id'`` entry of each element.
    data_ : pybrush.Dataset
        The complete data in Brush format.
    train_ : pybrush.Dataset
        Training partition of `data_` (a `1 - validation_size` proportion of
        the data), in Brush format.
    validation_ : pybrush.Dataset
        Validation partition of `data_` (a `validation_size` proportion of
        the data), in Brush format.
    search_space_ : a Brush `SearchSpace` object.
        Holds the operators and terminals and sampling utilities to update
        programs.
    feature_names_ : list
        Column names taken from `X` when it is a pandas DataFrame, otherwise
        an empty list.
    n_classes_ : int
        Number of unique values in `y` in classification mode, otherwise 0.
    parameters_ : pybrush.Parameters
        Engine parameters produced by ``_wrap_parameters``.
    engine_ : RegressorEngine, ClassifierEngine or MultiClassifierEngine
        The fitted c++ engine.
    """

    def __init__(self, **kwargs):
        EstimatorInterface.__init__(self, **kwargs)

    def fit(self, X, y):
        """
        Fit an estimator to X, y.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            2-d array of input data. Column names of a DataFrame are kept as
            feature names.
        y : np.ndarray or pd.Series
            1-d array of target values.

        Returns
        -------
        self : BrushEstimator
            The fitted estimator.
        """
        self.feature_names_ = []
        if isinstance(X, pd.DataFrame):
            self.feature_names_ = X.columns.to_list()

        self.data_ = self._make_data(
            X, y,
            feature_names=self.feature_names_,
            validation_size=self.validation_size,
        )

        # set n classes if relevant
        self.n_classes_ = 0
        if self.mode == "classification":
            self.n_classes_ = len(np.unique(y))

        # These have a default behavior to return something meaningful if
        # no values are set
        self.train_ = self.data_.get_training_data()
        # TODO: update batch indexes at the beginning of every generation
        self.train_.set_batch_size(self.batch_size)
        self.validation_ = self.data_.get_validation_data()

        self.parameters_ = self._wrap_parameters(n_classes=self.n_classes_)
        self.search_space_ = SearchSpace(
            self.data_, self.parameters_.functions, self.weights_init
        )

        # Pick the engine matching the task: binary and multi-class
        # classification use different engines; everything else is regression.
        self.engine_ = None
        if self.mode == 'classification':
            if self.n_classes_ == 2:
                engine_cls = ClassifierEngine
            else:
                engine_cls = MultiClassifierEngine
        else:
            engine_cls = RegressorEngine
        self.engine_ = engine_cls(self.parameters_)

        self.engine_.fit(self.data_)

        self.archive_ = self.engine_.get_archive()
        self.best_estimator_ = self.engine_.best_ind

        return self

    def _make_data(self, X, y=None, feature_names=None, validation_size=0.0):
        """
        Prepare the data for training or prediction.

        Parameters:
        - X: np.ndarray or pandas DataFrame, shape (n_samples, n_features)
            The input features.
        - y: array-like or pandas Series, shape (n_samples,), optional (default=None)
            The target variable. When None, the dataset is built without it.
        - feature_names: list, optional (default=None)
            The names of the features. None is treated as an empty list.
        - validation_size: float, optional (default=0.0)
            The proportion of the data to be used for validation.

        Returns:
        - dataset: Dataset
            The prepared dataset object containing the input features, target
            variable, feature names, and validation size.
        """
        # This function should not partition data (since it may be used in
        # `predict`). Partitioning is done by `fit`. Feature names should be
        # inferred before calling _make_data (so predict can be made with np
        # arrays or pd dataframes).

        # A None sentinel avoids sharing one mutable default list across calls.
        if feature_names is None:
            feature_names = []

        if isinstance(y, pd.Series):
            y = y.values
        if isinstance(X, pd.DataFrame):
            X = X.values

        assert isinstance(X, np.ndarray), \
            "X must be a numpy array or a pandas DataFrame"

        dataset_kwargs = dict(
            X=X,
            feature_names=feature_names,
            c=self.mode == "classification",
            validation_size=validation_size,
        )
        # Only forward the target when one was given.
        if y is not None:
            dataset_kwargs['y'] = y

        return Dataset(**dataset_kwargs)

    def _make_prediction_data(self, X):
        """Wrap `X` in a Dataset that references the training data `data_`."""
        if isinstance(X, pd.DataFrame):
            X = X.values

        assert isinstance(X, np.ndarray), \
            "X must be a numpy array or a pandas DataFrame"

        # NOTE(review): ref_dataset presumably keeps the new data consistent
        # with the training dataset -- confirm against the Dataset bindings.
        return Dataset(X=X, ref_dataset=self.data_,
                       c=self.mode == "classification",
                       feature_names=self.feature_names_)

    def predict(self, X):
        """
        Predict using the best estimator in the archive.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            2-d array of input data.

        Returns
        -------
        The result of calling ``predict`` on the best individual's program.
        """
        check_is_fitted(self)

        data = self._make_prediction_data(X)

        return self.best_estimator_.program.predict(data)

    def get_params(self, deep=True):
        """
        Return the estimator parameters as a dict.

        Every instance attribute whose name does not end with an underscore is
        reported. When `deep` is true, attributes that expose ``get_params``
        (and are not classes) also contribute their own parameters under
        ``<name>__<param>`` keys.
        """
        out = {}
        for key, value in self.__dict__.items():
            # Trailing-underscore attributes are fitted state, not parameters.
            if key.endswith('_'):
                continue

            if deep and hasattr(value, "get_params") and not isinstance(value, type):
                deep_items = value.get_params().items()
                out.update((key + "__" + k, val) for k, val in deep_items)

            out[key] = value
        return out

    def predict_archive(self, X):
        """
        Predict with every model in the engine's archive.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame
            2-d array of input data.

        Returns
        -------
        list of dict
            One dictionary per archive entry, with keys ``'id'`` (the entry's
            id) and ``'y_pred'`` (the engine's prediction for that id).
        """
        check_is_fitted(self)

        data = self._make_prediction_data(X)

        return [
            {
                'id': ind['id'],
                'y_pred': self.engine_.predict_archive(ind['id'], data),
            }
            for ind in self.engine_.get_archive()
        ]
class BrushClassifier(BrushEstimator, ClassifierMixin):
    """Classification estimator running on the Brush c++ engine.

    Parameters are defined and documented in
    :py:class:`EstimatorInterface <pybrush.EstimatorInterface.EstimatorInterface>`

    This class inherits from :py:class:`BrushEstimator <pybrush.BrushEstimator.BrushEstimator>`.
    A full documentation of the methods and attributes can be found there.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.read_csv('docs/examples/datasets/d_analcatdata_aids.csv')
    >>> X = df.drop(columns='target')
    >>> y = df['target']
    >>> from pybrush import BrushClassifier
    >>> est = BrushClassifier()
    >>> est.fit(X,y)
    >>> # print('score:', est.score(X,y))
    """

    def __init__(self, **kwargs):
        super().__init__(mode='classification', **kwargs)

    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : np.ndarray or pd.DataFrame of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class probabilities of the input samples. With two classes,
            the program's output is used as the second column and the first
            column is one minus that value.
        """
        check_is_fitted(self)

        features = X.values if isinstance(X, pd.DataFrame) else X
        assert isinstance(features, np.ndarray)

        dataset = Dataset(X=features, ref_dataset=self.data_, c=True,
                          feature_names=self.feature_names_)

        proba = self.best_estimator_.program.predict_proba(dataset)

        # Anything other than the binary case is returned as produced.
        if self.n_classes_ != 2:
            return proba

        # Binary case: start from a column of ones next to the program's
        # output, then subtract in place to obtain the complement column.
        ones_column = np.ones((features.shape[0], 1))
        two_columns = np.concatenate((ones_column, proba.reshape(-1, 1)), axis=1)
        two_columns[:, 0] -= two_columns[:, 1]

        return two_columns

    def predict_proba_archive(self, X):
        """Predict probabilities with every model in the engine's archive.

        Returns a list with one dictionary per archive entry, holding the
        entry's ``'id'`` and, under ``'y_pred'``, the engine's probability
        prediction for that id.
        """
        check_is_fitted(self)

        features = X.values if isinstance(X, pd.DataFrame) else X
        assert isinstance(features, np.ndarray)

        dataset = Dataset(X=features, ref_dataset=self.data_, c=True,
                          feature_names=self.feature_names_)

        return [
            {
                'id': member['id'],
                'y_pred': self.engine_.predict_proba_archive(member['id'], dataset),
            }
            for member in self.engine_.get_archive()
        ]
class BrushRegressor(BrushEstimator, RegressorMixin):
    """Regression estimator running on the Brush c++ engine.

    Parameters are defined and documented in
    :py:class:`EstimatorInterface <pybrush.EstimatorInterface.EstimatorInterface>`

    This class inherits from :py:class:`BrushEstimator <pybrush.BrushEstimator.BrushEstimator>`.
    A full documentation of the methods and attributes can be found there.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.read_csv('docs/examples/datasets/d_enc.csv')
    >>> X = df.drop(columns='label')
    >>> y = df['label']
    >>> from pybrush import BrushRegressor
    >>> est = BrushRegressor()
    >>> est.fit(X,y)
    >>> # print('score:', est.score(X,y))
    """

    def __init__(self, **kwargs):
        # Any mode other than "classification" takes the non-classification
        # branches in BrushEstimator; this class passes "regressor".
        super().__init__(
            mode="regressor",
            **kwargs,
        )