xgb.py
"""
====================================================================================
Distribute hyperparameter tuning with gradient boosting trees via DistGridSearchCV
====================================================================================
In this example we train a classifier and regression with XGBoost by distributing
the hyperparameter tuning through DistGridSearchCV. This should work right out of the
box with XGBoost's sklearn wrapper.
Given the sequential nature of training estimators on gradient boosting trees, it
makes sense to distribute the hyperparameters and cross validation folds, rather than
trying to train multiple estimators in parallel. Skdist excels in this functionality by
leveraging DistGridSearchCV. In this example, we are able to train 54 unique sets of
hyperparameters in parallel and return the the best model to the driver.
NOTE: This example uses xgboost==0.90
Here is a sample output run:
-- Grid Search --
Best Score: 0.9936882800963308
Best colsample_bytree: 1.0
Best learning_rate: 0.05
Best max_depth: 4
Best n_estimators: 300
DistGridSearchCV(cv=5, error_score='raise-deprecating',
estimator=XGBClassifier(base_score=0.5, booster='gbtree',
colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1,
gamma=0, learning_rate=0.1,
max_delta_step=0, max_depth=3,
min_child_weight=1, missing=nan,
n_estimators=100, n_jobs=1,
nthread=None,
objective='binary:logistic',
random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1,
seed=None, silent=None, subsample=1,
verbosity=1),
iid='warn', n_jobs=None,
param_grid={'colsample_bytree': [0.6, 0.8, 1.0],
'learning_rate': [0.05, 0.01],
'max_depth': [4, 6, 8],
'n_estimators': [100, 200, 300]},
partitions='auto', pre_dispatch='2*n_jobs', preds=False,
refit=True, return_train_score=False, sc=None,
scoring='roc_auc', verbose=0)
-- Grid Search --
Best Score: -18.452273211144295
Best colsample_bytree: 0.8
Best learning_rate: 0.05
Best max_depth: 4
Best n_estimators: 200
DistGridSearchCV(cv=5, error_score='raise-deprecating',
estimator=XGBRegressor(base_score=0.5, booster='gbtree',
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, gamma=0,
importance_type='gain',
learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1,
missing=nan, n_estimators=100, n_jobs=1,
nthread=None,
objective='reg:squarederror',
random...
reg_lambda=1, scale_pos_weight=1,
seed=None, silent=None, subsample=1,
verbosity=1),
iid='warn', n_jobs=None,
param_grid={'colsample_bytree': [0.6, 0.8, 1.0],
'learning_rate': [0.05, 0.01],
'max_depth': [4, 6, 8],
'n_estimators': [100, 200, 300]},
partitions='auto', pre_dispatch='2*n_jobs', preds=False,
refit=True, return_train_score=False, sc=None,
scoring='neg_mean_squared_error', verbose=0)
"""
print(__doc__)
import pickle
from skdist.distribute.search import DistGridSearchCV
from sklearn.datasets import load_breast_cancer, load_boston
from xgboost import XGBClassifier, XGBRegressor
from pyspark.sql import SparkSession
# spark session initialization
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
### XGBClassifier ###
# sklearn variables
cv = 5
clf_scoring = "roc_auc"
reg_scoring = "neg_mean_squared_error"
# load sample data (binary target)
data = load_breast_cancer()
X = data["data"]
y = data["target"]
grid = dict(
learning_rate=[0.05, 0.01],
max_depth=[4, 6, 8],
colsample_bytree=[0.6, 0.8, 1.0],
n_estimators=[100, 200, 300],
)
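# the grid above yields 2 x 3 x 3 x 3 = 54 candidate hyperparameter sets,
# matching the 54 parallel fits mentioned in the docstring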
### distributed grid search
model = DistGridSearchCV(XGBClassifier(), grid, sc, cv=cv, scoring=clf_scoring)
# distributed fitting with spark
model.fit(X, y)
# predictions on the driver
preds = model.predict(X)
probs = model.predict_proba(X)
# results
print("-- Grid Search --")
print("Best Score: {0}".format(model.best_score_))
print("Best colsample_bytree: {0}".format(model.best_estimator_.colsample_bytree))
print("Best learning_rate: {0}".format(model.best_estimator_.learning_rate))
print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
print(pickle.loads(pickle.dumps(model)))
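# A minimal sketch of persisting the fitted search for reuse off the Spark
# cluster; the path "xgb_clf_search.pkl" is an illustrative name, not part of
# the original example. Because the fitted DistGridSearchCV pickles cleanly
# (demonstrated by the print above), the saved object can be loaded later and
# used for prediction without a SparkContext.
with open("xgb_clf_search.pkl", "wb") as f:
    pickle.dump(model, f)
with open("xgb_clf_search.pkl", "rb") as f:
    local_model = pickle.load(f)
print(local_model.predict(X[:5]))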
### XGBRegressor ###
# load sample data (continuous target)
data = load_boston()
X = data["data"]
y = data["target"]
grid = dict(
learning_rate=[0.05, 0.01],
max_depth=[4, 6, 8],
colsample_bytree=[0.6, 0.8, 1.0],
n_estimators=[100, 200, 300],
)
### distributed grid search
model = DistGridSearchCV(
XGBRegressor(objective="reg:squarederror"), grid, sc, cv=cv, scoring=reg_scoring
)
# distributed fitting with spark
model.fit(X, y)
# predictions on the driver
preds = model.predict(X)
# results
print("-- Grid Search --")
print("Best Score: {0}".format(model.best_score_))
print("Best colsample_bytree: {0}".format(model.best_estimator_.colsample_bytree))
print("Best learning_rate: {0}".format(model.best_estimator_.learning_rate))
print("Best max_depth: {0}".format(model.best_estimator_.max_depth))
print("Best n_estimators: {0}".format(model.best_estimator_.n_estimators))
print(pickle.loads(pickle.dumps(model)))
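# A minimal sketch of inspecting the full cross validation results, assuming
# DistGridSearchCV exposes the same cv_results_ attribute as sklearn's
# GridSearchCV (whose interface it mirrors); this prints the mean test score
# and rank for the candidate hyperparameter sets.
import pandas as pd

results = pd.DataFrame(model.cv_results_)
print(results[["params", "mean_test_score", "rank_test_score"]].head())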