Skip to content

Commit 47bd54f

Browse files
rijkvandermeulenRijk van der Meulenmadtoinoudennisbader
authored and committed
Functionality to let LightGBM effectively handle categorical features (unit8co#1585)
* unit8co#1580 exploration * unit8co#1580 added cat_components to TimeSeries * unit8co#1580 _fit_model method LightGBM * unit8co#1580 included static covs in dummy unit test * unit8co#1580 integration with lgbm * unit8co#1580 helper func to method in RegressionModel * unit8co#1580 different approach; pass categorical covs to fit method of lgbm directly instead of attributes TimeSeries object * unit8co#1580 added few unit tests * unit8co#1580 small stuff * unit8co#1580 move categorical covs to model constructor * unit8co#1580 avoid code duplication in unit tests * unit8co#1580 add unit test on forecast quality with cat covs * unit8co#1580 add column names check in _get_categorical_covs helper * unit8co#1580 docstrings lgbm * unit8co#1580 add changelog entry * unit8co#1580 change check if ts has static cov * unit8co#1580 implemented RegressionModelWithCategoricalCovariates class * unit8co#1580 delete redundant test * unit8co#1580 replace test_quality_forecast_with_categorical_covariates unit test by suggestion Dennis * unit8co#1580 adjustment error messages validation method * unit8co#1580 adding categorical feature support for CatBoost * unit8co#1580 remove cat support CatBoost and smaller comments Dennis * unit8co#1580 finalizing * unit8co#1580 use parent _fit_model method * avoid creating lagged data twice * remove empty lines --------- Co-authored-by: Rijk van der Meulen <[email protected]> Co-authored-by: madtoinou <[email protected]> Co-authored-by: Dennis Bader <[email protected]>
1 parent 56ba438 commit 47bd54f

File tree

6 files changed

+604
-6
lines changed

6 files changed

+604
-6
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
1010
[#1545](https://github.com/unit8co/darts/pull/1545) by [Rijk van der Meulen](https://github.com/rijkvandermeulen).
1111

1212
[Full Changelog](https://github.com/unit8co/darts/compare/0.23.1...master)
13+
- `LightGBM` model now supports native categorical feature handling as described
14+
[here](https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features).
15+
[#1585](https://github.com/unit8co/darts/pull/1585) by [Rijk van der Meulen](https://github.com/rijkvandermeulen).
1316

1417
## [0.23.1](https://github.com/unit8co/darts/tree/0.23.1) (2023-01-12)
1518
Patch release

darts/models/forecasting/lgbm.py

+25-3
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@
1616
import numpy as np
1717

1818
from darts.logging import get_logger
19-
from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin
19+
from darts.models.forecasting.regression_model import (
20+
RegressionModelWithCategoricalCovariates,
21+
_LikelihoodMixin,
22+
)
2023
from darts.timeseries import TimeSeries
2124

2225
logger = get_logger(__name__)
2326

2427

25-
class LightGBMModel(RegressionModel, _LikelihoodMixin):
28+
class LightGBMModel(RegressionModelWithCategoricalCovariates, _LikelihoodMixin):
2629
def __init__(
2730
self,
2831
lags: Union[int, list] = None,
@@ -34,6 +37,9 @@ def __init__(
3437
quantiles: List[float] = None,
3538
random_state: Optional[int] = None,
3639
multi_models: Optional[bool] = True,
40+
categorical_past_covariates: Optional[Union[str, List[str]]] = None,
41+
categorical_future_covariates: Optional[Union[str, List[str]]] = None,
42+
categorical_static_covariates: Optional[Union[str, List[str]]] = None,
3743
**kwargs,
3844
):
3945
"""LGBM Model
@@ -87,6 +93,20 @@ def __init__(
8793
multi_models
8894
If True, a separate model will be trained for each future lag to predict. If False, a single model is
8995
trained to predict at step 'output_chunk_length' in the future. Default: True.
96+
categorical_past_covariates
97+
Optionally, component name or list of component names specifying the past covariates that should be treated
98+
as categorical by the underlying `lightgbm.LGBMRegressor`. It's recommended that the components that
99+
are treated as categorical are integer-encoded. For more information on how LightGBM handles categorical
100+
features, visit: `Categorical feature support documentation
101+
<https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features>`_
102+
categorical_future_covariates
103+
Optionally, component name or list of component names specifying the future covariates that should be
104+
treated as categorical by the underlying `lightgbm.LGBMRegressor`. It's recommended that the components
105+
that are treated as categorical are integer-encoded.
106+
categorical_static_covariates
107+
Optionally, string or list of strings specifying the static covariates that should be treated as categorical
108+
by the underlying `lightgbm.LGBMRegressor`. It's recommended that the static covariates that are
109+
treated as categorical are integer-encoded.
90110
**kwargs
91111
Additional keyword arguments passed to `lightgbm.LGBMRegressor`.
92112
"""
@@ -117,6 +137,9 @@ def __init__(
117137
add_encoders=add_encoders,
118138
multi_models=multi_models,
119139
model=lgb.LGBMRegressor(**self.kwargs),
140+
categorical_past_covariates=categorical_past_covariates,
141+
categorical_future_covariates=categorical_future_covariates,
142+
categorical_static_covariates=categorical_static_covariates,
120143
)
121144

122145
def fit(
@@ -157,7 +180,6 @@ def fit(
157180
**kwargs
158181
Additional kwargs passed to `lightgbm.LGBMRegressor.fit()`
159182
"""
160-
161183
if val_series is not None:
162184
kwargs["eval_set"] = self._create_lagged_data(
163185
target_series=val_series,

darts/models/forecasting/regression_model.py

+302-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
When static covariates are present, they are appended to the lagged features. When multiple time series are passed,
2727
if their static covariates do not have the same size, the shorter ones are padded with 0 valued features.
2828
"""
29-
3029
from collections import OrderedDict
3130
from typing import List, Optional, Sequence, Tuple, Union
3231

@@ -38,7 +37,12 @@
3837
from darts.timeseries import TimeSeries
3938
from darts.utils.data.tabularization import create_lagged_training_data
4039
from darts.utils.multioutput import MultiOutputRegressor
41-
from darts.utils.utils import _check_quantiles, seq2series, series2seq
40+
from darts.utils.utils import (
41+
_check_quantiles,
42+
get_single_series,
43+
seq2series,
44+
series2seq,
45+
)
4246

4347
logger = get_logger(__name__)
4448

@@ -1029,3 +1033,299 @@ def _quantile_sampling(self, model_output: np.ndarray) -> np.ndarray:
10291033
class _QuantileModelContainer(OrderedDict):
10301034
def __init__(self):
10311035
super().__init__()
1036+
1037+
1038+
class RegressionModelWithCategoricalCovariates(RegressionModel):
    def __init__(
        self,
        lags: Union[int, list] = None,
        lags_past_covariates: Union[int, List[int]] = None,
        lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
        output_chunk_length: int = 1,
        add_encoders: Optional[dict] = None,
        model=None,
        multi_models: Optional[bool] = True,
        categorical_past_covariates: Optional[Union[str, List[str]]] = None,
        categorical_future_covariates: Optional[Union[str, List[str]]] = None,
        categorical_static_covariates: Optional[Union[str, List[str]]] = None,
    ):
        """
        Extension of `RegressionModel` for regression models that support categorical covariates.

        Parameters
        ----------
        lags
            Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags
            are used (from -1 backward). Otherwise, a list of integers with lags is required (each lag must be < 0).
        lags_past_covariates
            Number of lagged past_covariates values used to predict the next time step. If an integer is given the last
            `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers
            with lags < 0 is required.
        lags_future_covariates
            Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is
            given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first
            `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list
            of integers with lags is required.
        output_chunk_length
            Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
            horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
            be useful if the covariates don't extend far enough into the future.
        add_encoders
            A large number of past and future covariates can be automatically generated with `add_encoders`.
            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
            transform the generated covariates. This happens all under one hood and only needs to be specified at
            model creation.
            Read :meth:`SequentialEncoder <darts.dataprocessing.encoders.SequentialEncoder>` to find out more about
            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:

            .. highlight:: python
            .. code-block:: python

                add_encoders={
                    'cyclic': {'future': ['month']},
                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
                    'position': {'past': ['relative'], 'future': ['relative']},
                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
                    'transformer': Scaler()
                }
            ..
        model
            Scikit-learn-like model with ``fit()`` and ``predict()`` methods. Also possible to use model that doesn't
            support multi-output regression for multivariate timeseries, in which case one regressor
            will be used per component in the multivariate series.
            If None, defaults to: ``sklearn.linear_model.LinearRegression(n_jobs=-1)``.
        multi_models
            If True, a separate model will be trained for each future lag to predict. If False, a single model is
            trained to predict at step 'output_chunk_length' in the future. Default: True.
        categorical_past_covariates
            Optionally, component name or list of component names specifying the past covariates that should be treated
            as categorical.
        categorical_future_covariates
            Optionally, component name or list of component names specifying the future covariates that should be
            treated as categorical.
        categorical_static_covariates
            Optionally, string or list of strings specifying the static covariates that should be treated as
            categorical.
        """
        super().__init__(
            lags=lags,
            lags_past_covariates=lags_past_covariates,
            lags_future_covariates=lags_future_covariates,
            output_chunk_length=output_chunk_length,
            add_encoders=add_encoders,
            model=model,
            multi_models=multi_models,
        )
        # A single name is wrapped into a one-element list so that downstream code can always
        # iterate over names; `None` and lists are stored unchanged.
        self.categorical_past_covariates = (
            [categorical_past_covariates]
            if isinstance(categorical_past_covariates, str)
            else categorical_past_covariates
        )
        self.categorical_future_covariates = (
            [categorical_future_covariates]
            if isinstance(categorical_future_covariates, str)
            else categorical_future_covariates
        )
        self.categorical_static_covariates = (
            [categorical_static_covariates]
            if isinstance(categorical_static_covariates, str)
            else categorical_static_covariates
        )

    def fit(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        max_samples_per_ts: Optional[int] = None,
        n_jobs_multioutput_wrapper: Optional[int] = None,
        **kwargs,
    ):
        """
        Validates the declared categorical covariates against the passed data, then delegates to the parent `fit()`.

        Parameters
        ----------
        series
            TimeSeries or Sequence[TimeSeries] object containing the target values.
        past_covariates
            Optionally, a series or sequence of series specifying past-observed covariates
        future_covariates
            Optionally, a series or sequence of series specifying future-known covariates
        max_samples_per_ts
            Forwarded unchanged to the parent `fit()`.
        n_jobs_multioutput_wrapper
            Forwarded unchanged to the parent `fit()`.
        **kwargs
            Additional keyword arguments forwarded unchanged to the parent `fit()`.

        Returns
        -------
        Whatever the parent `fit()` returns.

        Raises
        ------
        ValueError
            If the categorical covariates declared at model creation are inconsistent with the passed data
            (see `_validate_categorical_covariates`).
        """
        self._validate_categorical_covariates(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
        )
        # Forward the parent's return value instead of silently dropping it.
        return super().fit(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            n_jobs_multioutput_wrapper=n_jobs_multioutput_wrapper,
            **kwargs,
        )

    @property
    def _categorical_fit_param_name(self) -> str:
        """
        Returns the name of the parameter of the model's `fit` method that specifies the categorical features.
        Can be overridden in subclasses.
        """
        return "categorical_feature"

    def _validate_categorical_covariates(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    ) -> None:
        """
        Checks that the categorical covariates are valid. Specifically, checks that the categorical covariates
        of the model are a subset of all covariates.

        Only the series returned by `get_single_series` is inspected for each input.

        Parameters
        ----------
        series
            TimeSeries or Sequence[TimeSeries] object containing the target values.
        past_covariates
            Optionally, a series or sequence of series specifying past-observed covariates
        future_covariates
            Optionally, a series or sequence of series specifying future-known covariates

        Raises
        ------
        ValueError
            If categorical covariates of some type were declared but no covariates of that type were passed, or if
            some declared names are missing from the passed covariates / static covariates.
        """
        for categorical_covariates, covariates, cov_type in zip(
            [self.categorical_past_covariates, self.categorical_future_covariates],
            [past_covariates, future_covariates],
            ["past_covariates", "future_covariates"],
        ):
            if not categorical_covariates:
                continue
            if not covariates:
                raise_log(
                    ValueError(
                        f"`categorical_{cov_type}` were declared at model creation but no "
                        f"`{cov_type}` are passed to the `fit()` call."
                    ),
                )
            s = get_single_series(covariates)
            missing = set(categorical_covariates) - set(s.components)
            if missing:
                raise_log(
                    ValueError(
                        f"Some `categorical_{cov_type}` components "
                        f"({missing}) "
                        f"declared at model creation are not present in the `{cov_type}` "
                        f"passed to the `fit()` call."
                    )
                )

        if self.categorical_static_covariates:
            s = get_single_series(series)
            if not s.has_static_covariates:
                raise_log(
                    ValueError(
                        "`categorical_static_covariates` were declared at model creation but `series` "
                        "passed to the `fit()` call does not contain `static_covariates`."
                    ),
                )
            missing = set(self.categorical_static_covariates) - set(
                s.static_covariates.columns
            )
            if missing:
                raise_log(
                    ValueError(
                        f"Some `categorical_static_covariates` components "
                        f"({missing}) "
                        f"declared at model creation are not present in the series' `static_covariates` "
                        f"passed to the `fit()` call."
                    )
                )

    def _get_categorical_features(
        self,
        series: Union[List[TimeSeries], TimeSeries],
        past_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
        future_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
    ) -> Tuple[List[int], List[str]]:
        """
        Returns the indices and column names of the categorical features in the regression model.

        Steps:
            1. Get the list of features used in the model. We keep the creation order of the different lags/features
               in create_lagged_data.
            2. Flag a feature as categorical only when its component (or static covariate column) name is exactly
               one of the names declared for that covariate type. Target lags are never flagged.

        Parameters
        ----------
        series
            TimeSeries or list of TimeSeries containing the target values.
        past_covariates
            Optionally, a series or list of series specifying past-observed covariates
        future_covariates
            Optionally, a series or list of series specifying future-known covariates

        Returns
        -------
        Tuple[List[int], List[str]]
            The positions of the categorical features in the feature list (ascending, without duplicates) and the
            corresponding feature names. Both lists are empty if no categorical covariates were declared.
        """
        cat_past = set(self.categorical_past_covariates or [])
        cat_future = set(self.categorical_future_covariates or [])
        cat_static = set(self.categorical_static_covariates or [])

        if not (cat_past or cat_future or cat_static):
            return [], []

        target_ts = get_single_series(series)
        past_covs_ts = get_single_series(past_covariates)
        fut_covs_ts = get_single_series(future_covariates)

        # (feature name, is categorical) pairs.
        # We keep the creation order of the different lags/features in create_lagged_data.
        # Matching is done on the exact component name per covariate type: a substring test on the
        # generated feature name would also select target lags and unrelated covariates whose names
        # merely contain a declared name.
        features: List[Tuple[str, bool]] = []
        for lag in self.lags.get("target", []):
            for component in target_ts.components:
                features.append((f"target_{component}_lag{lag}", False))
        # The covariate series are only accessed when the matching lags exist, so a missing (None)
        # covariate series is not touched when it is not used by the model.
        for lag in self.lags.get("past", []):
            for component in past_covs_ts.components:
                features.append(
                    (f"past_cov_{component}_lag{lag}", component in cat_past)
                )
        for lag in self.lags.get("future", []):
            for component in fut_covs_ts.components:
                features.append(
                    (f"fut_cov_{component}_lag{lag}", component in cat_future)
                )
        if target_ts.has_static_covariates:
            for column in target_ts.static_covariates.columns:
                features.append((column, column in cat_static))

        indices = [i for i, (_, is_cat) in enumerate(features) if is_cat]
        col_names = [features[i][0] for i in indices]

        return indices, col_names

    def _fit_model(
        self,
        target_series,
        past_covariates,
        future_covariates,
        max_samples_per_ts,
        **kwargs,
    ):
        """
        Custom fit function for `RegressionModelWithCategoricalCovariates` models, adding logic to let the model
        handle categorical features directly.

        The indices of the categorical features are injected into `kwargs` under the name given by
        `_categorical_fit_param_name` (overwriting any value passed under that name) before delegating to the
        parent `_fit_model`.
        """
        cat_col_indices, _ = self._get_categorical_features(
            target_series,
            past_covariates,
            future_covariates,
        )

        kwargs[self._categorical_fit_param_name] = cat_col_indices
        super()._fit_model(
            target_series=target_series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            **kwargs,
        )

0 commit comments

Comments
 (0)