Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functionality to let LightGBM effectively handle categorical features #1585

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ec8fe6d
#1580 exploration
Feb 20, 2023
5ee855d
#1580 added cat_components to TimeSeries
Feb 20, 2023
149b2c7
#1580 _fit_model method LightGBM
Feb 20, 2023
b02e8b1
#1580 included static covs in dummy unit test
Feb 20, 2023
948be36
#1580 integration with lgbm
Feb 20, 2023
3c2fee2
#1580 helper func to method in RegressionModel
Feb 21, 2023
c3d642f
#1580 different approach; pass categorical covs to fit method of lgbm…
Feb 21, 2023
5679eeb
#1580 added few unit tests
Feb 21, 2023
ef7fcf8
#1580 small stuff
Feb 21, 2023
5a5a09f
Merge branch 'master' into feature/use_model_native_way_cat_features
madtoinou Feb 22, 2023
4c5b140
#1580 move categorical covs to model constructor
Feb 27, 2023
f6b25fc
#1580 avoid code duplication in unit tests
Feb 28, 2023
e7cde27
#1580 add unit test on forecast quality with cat covs
Feb 28, 2023
d8aa69f
#1580 add column names check in _get_categorical_covs helper
Feb 28, 2023
5be4f4c
#1580 docstrings lgbm
Feb 28, 2023
dc9ceeb
#1580 add changelog entry
Feb 28, 2023
713a850
Merge branch 'feature/use_model_native_way_cat_features' of https://g…
Feb 28, 2023
165d1bc
#1580 change check if ts has static cov
Feb 28, 2023
d02d3a0
Merge branch 'master' into feature/use_model_native_way_cat_features
Feb 28, 2023
95bf521
Merge branch 'master' into feature/use_model_native_way_cat_features
dennisbader Mar 5, 2023
9df90ae
#1580 implemented RegressionModelWithCategoricalCovariates class
Mar 12, 2023
36e56de
#1580 delete redundant test
Mar 12, 2023
e85bad2
#1580 replace test_quality_forecast_with_categorical_covariates unit …
Mar 12, 2023
9ba3190
#1580 adjustment error messages validation method
Mar 12, 2023
5f2535b
#1580 adding categorical feature support for CatBoost
Mar 12, 2023
ae1d4df
#1580 remove cat support CatBoost and smaller comments Dennis
Mar 27, 2023
7cb8c72
#1580 finalizing
Mar 27, 2023
20073fe
Merge branch 'master' into feature/use_model_native_way_cat_features
Mar 27, 2023
6eb4ed4
#1580 use parent _fit_model method
Mar 27, 2023
5dc1341
Merge branch 'master' into feature/use_model_native_way_cat_features
Mar 27, 2023
fc41cd8
avoid creating lagged data twice
dennisbader Mar 27, 2023
0836ff2
remove empty lines
dennisbader Mar 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
[#1545](https://github.com/unit8co/darts/pull/1545) by [Rijk van der Meulen](https://github.com/rijkvandermeulen).

[Full Changelog](https://github.com/unit8co/darts/compare/0.23.1...master)
- `LightGBM` model now supports native categorical feature handling as described
[here](https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features).
[#1585](https://github.com/unit8co/darts/pull/1585) by [Rijk van der Meulen](https://github.com/rijkvandermeulen)

## [0.23.1](https://github.com/unit8co/darts/tree/0.23.1) (2023-01-12)
Patch release
Expand Down
28 changes: 25 additions & 3 deletions darts/models/forecasting/lgbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@
import numpy as np

from darts.logging import get_logger
from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin
from darts.models.forecasting.regression_model import (
RegressionModelWithCategoricalCovariates,
_LikelihoodMixin,
)
from darts.timeseries import TimeSeries

logger = get_logger(__name__)


class LightGBMModel(RegressionModel, _LikelihoodMixin):
class LightGBMModel(RegressionModelWithCategoricalCovariates, _LikelihoodMixin):
def __init__(
self,
lags: Union[int, list] = None,
Expand All @@ -34,6 +37,9 @@ def __init__(
quantiles: List[float] = None,
random_state: Optional[int] = None,
multi_models: Optional[bool] = True,
categorical_past_covariates: Optional[Union[str, List[str]]] = None,
categorical_future_covariates: Optional[Union[str, List[str]]] = None,
categorical_static_covariates: Optional[Union[str, List[str]]] = None,
**kwargs,
):
"""LGBM Model
Expand Down Expand Up @@ -87,6 +93,20 @@ def __init__(
multi_models
If True, a separate model will be trained for each future lag to predict. If False, a single model is
trained to predict at step 'output_chunk_length' in the future. Default: True.
categorical_past_covariates
Optionally, component name or list of component names specifying the past covariates that should be treated
as categorical by the underlying `lightgbm.LGBMRegressor`. It's recommended that the components that
are treated as categorical are integer-encoded. For more information on how LightGBM handles categorical
features, visit: `Categorical feature support documentation
<https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features>`_
categorical_future_covariates
Optionally, component name or list of component names specifying the future covariates that should be
treated as categorical by the underlying `lightgbm.LGBMRegressor`. It's recommended that the components
that are treated as categorical are integer-encoded.
categorical_static_covariates
Optionally, string or list of strings specifying the static covariates that should be treated as categorical
by the underlying `lightgbm.LGBMRegressor`. It's recommended that the static covariates that are
treated as categorical are integer-encoded.
**kwargs
Additional keyword arguments passed to `lightgbm.LGBMRegressor`.
"""
Expand Down Expand Up @@ -117,6 +137,9 @@ def __init__(
add_encoders=add_encoders,
multi_models=multi_models,
model=lgb.LGBMRegressor(**self.kwargs),
categorical_past_covariates=categorical_past_covariates,
categorical_future_covariates=categorical_future_covariates,
categorical_static_covariates=categorical_static_covariates,
)

def fit(
Expand Down Expand Up @@ -157,7 +180,6 @@ def fit(
**kwargs
Additional kwargs passed to `lightgbm.LGBMRegressor.fit()`
"""

if val_series is not None:
kwargs["eval_set"] = self._create_lagged_data(
target_series=val_series,
Expand Down
304 changes: 302 additions & 2 deletions darts/models/forecasting/regression_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
When static covariates are present, they are appended to the lagged features. When multiple time series are passed,
if their static covariates do not have the same size, the shorter ones are padded with 0 valued features.
"""

from collections import OrderedDict
from typing import List, Optional, Sequence, Tuple, Union

Expand All @@ -38,7 +37,12 @@
from darts.timeseries import TimeSeries
from darts.utils.data.tabularization import create_lagged_training_data
from darts.utils.multioutput import MultiOutputRegressor
from darts.utils.utils import _check_quantiles, seq2series, series2seq
from darts.utils.utils import (
_check_quantiles,
get_single_series,
seq2series,
series2seq,
)

logger = get_logger(__name__)

Expand Down Expand Up @@ -1029,3 +1033,299 @@ def _quantile_sampling(self, model_output: np.ndarray) -> np.ndarray:
class _QuantileModelContainer(OrderedDict):
    """Thin `OrderedDict` subclass whose constructor takes no arguments.

    Instances therefore always start out as an empty, insertion-ordered mapping.
    """

    def __init__(self):
        # No initial items are forwarded: the container is always created empty.
        super().__init__()


class RegressionModelWithCategoricalCovariates(RegressionModel):
    def __init__(
        self,
        lags: Union[int, list] = None,
        lags_past_covariates: Union[int, List[int]] = None,
        lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
        output_chunk_length: int = 1,
        add_encoders: Optional[dict] = None,
        model=None,
        multi_models: Optional[bool] = True,
        categorical_past_covariates: Optional[Union[str, List[str]]] = None,
        categorical_future_covariates: Optional[Union[str, List[str]]] = None,
        categorical_static_covariates: Optional[Union[str, List[str]]] = None,
    ):
        """
        Extension of `RegressionModel` for regression models that support categorical covariates.

        Parameters
        ----------
        lags
            Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags
            are used (from -1 backward). Otherwise, a list of integers with lags is required (each lag must be < 0).
        lags_past_covariates
            Number of lagged past_covariates values used to predict the next time step. If an integer is given the last
            `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers
            with lags < 0 is required.
        lags_future_covariates
            Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is
            given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first
            `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list
            of integers with lags is required.
        output_chunk_length
            Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
            horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
            be useful if the covariates don't extend far enough into the future.
        add_encoders
            A large number of past and future covariates can be automatically generated with `add_encoders`.
            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
            transform the generated covariates. This happens all under one hood and only needs to be specified at
            model creation.
            Read :meth:`SequentialEncoder <darts.dataprocessing.encoders.SequentialEncoder>` to find out more about
            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:

            .. highlight:: python
            .. code-block:: python

                add_encoders={
                    'cyclic': {'future': ['month']},
                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
                    'position': {'past': ['relative'], 'future': ['relative']},
                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
                    'transformer': Scaler()
                }
            ..
        model
            Scikit-learn-like model with ``fit()`` and ``predict()`` methods. Also possible to use model that doesn't
            support multi-output regression for multivariate timeseries, in which case one regressor
            will be used per component in the multivariate series.
            If None, defaults to: ``sklearn.linear_model.LinearRegression(n_jobs=-1)``.
        multi_models
            If True, a separate model will be trained for each future lag to predict. If False, a single model is
            trained to predict at step 'output_chunk_length' in the future. Default: True.
        categorical_past_covariates
            Optionally, component name or list of component names specifying the past covariates that should be treated
            as categorical.
        categorical_future_covariates
            Optionally, component name or list of component names specifying the future covariates that should be
            treated as categorical.
        categorical_static_covariates
            Optionally, string or list of strings specifying the static covariates that should be treated as
            categorical.
        """
        super().__init__(
            lags=lags,
            lags_past_covariates=lags_past_covariates,
            lags_future_covariates=lags_future_covariates,
            output_chunk_length=output_chunk_length,
            add_encoders=add_encoders,
            model=model,
            multi_models=multi_models,
        )
        # A single name is stored as a one-element list so that the rest of the
        # class can always treat these attributes as "list of names or None".
        self.categorical_past_covariates = self._wrap_single_name(
            categorical_past_covariates
        )
        self.categorical_future_covariates = self._wrap_single_name(
            categorical_future_covariates
        )
        self.categorical_static_covariates = self._wrap_single_name(
            categorical_static_covariates
        )

    @staticmethod
    def _wrap_single_name(names):
        """Return ``[names]`` if `names` is a single string, otherwise return `names` unchanged."""
        return [names] if isinstance(names, str) else names

    def fit(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        max_samples_per_ts: Optional[int] = None,
        n_jobs_multioutput_wrapper: Optional[int] = None,
        **kwargs,
    ):
        """
        Validates the categorical covariates declared at model creation against the data passed here, then
        delegates to the parent class' `fit()`.

        All parameters are forwarded unchanged to the parent `fit()`.

        Returns
        -------
        Whatever the parent class' `fit()` returns. The result is propagated (rather than discarded) so that
        call chaining keeps working if the parent returns the fitted model.

        Raises
        ------
        ValueError
            If categorical covariates were declared at model creation but the corresponding covariates (or static
            covariates) are missing, or do not contain the declared names.
        """
        self._validate_categorical_covariates(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
        )
        return super().fit(
            series=series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            n_jobs_multioutput_wrapper=n_jobs_multioutput_wrapper,
            **kwargs,
        )

    @property
    def _categorical_fit_param_name(self) -> str:
        """
        Returns the name of the parameter of the model's `fit` method that specifies the categorical features.
        Can be overridden in subclasses.
        """
        return "categorical_feature"

    def _validate_categorical_covariates(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    ) -> None:
        """
        Checks that the categorical covariates are valid. Specifically, checks that the categorical covariates
        of the model are a subset of all covariates.

        Only the single series returned by `get_single_series` is inspected for each input.

        Parameters
        ----------
        series
            TimeSeries or Sequence[TimeSeries] object containing the target values.
        past_covariates
            Optionally, a series or sequence of series specifying past-observed covariates
        future_covariates
            Optionally, a series or sequence of series specifying future-known covariates

        Raises
        ------
        ValueError
            If categorical past/future/static covariates were declared at model creation but the matching
            covariates are not passed, or do not contain all of the declared names.
        """
        for categorical_covariates, covariates, cov_type in zip(
            (self.categorical_past_covariates, self.categorical_future_covariates),
            (past_covariates, future_covariates),
            ("past_covariates", "future_covariates"),
        ):
            if not categorical_covariates:
                continue
            if not covariates:
                raise_log(
                    ValueError(
                        f"`categorical_{cov_type}` were declared at model creation but no "
                        f"`{cov_type}` are passed to the `fit()` call."
                    ),
                )
            s = get_single_series(covariates)
            missing = set(categorical_covariates) - set(s.components)
            if missing:
                raise_log(
                    ValueError(
                        f"Some `categorical_{cov_type}` components "
                        f"({missing}) "
                        f"declared at model creation are not present in the `{cov_type}` "
                        f"passed to the `fit()` call."
                    )
                )

        if self.categorical_static_covariates:
            s = get_single_series(series)
            if not s.has_static_covariates:
                raise_log(
                    ValueError(
                        "`categorical_static_covariates` were declared at model creation but `series` "
                        "passed to the `fit()` call does not contain `static_covariates`."
                    ),
                )
            missing = set(self.categorical_static_covariates) - set(
                s.static_covariates.columns
            )
            if missing:
                raise_log(
                    ValueError(
                        f"Some `categorical_static_covariates` components "
                        f"({missing}) "
                        f"declared at model creation are not present in the series' `static_covariates` "
                        f"passed to the `fit()` call."
                    )
                )

    def _get_categorical_features(
        self,
        series: Union[List[TimeSeries], TimeSeries],
        past_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
        future_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
    ) -> Tuple[List[int], List[str]]:
        """
        Returns the indices and column names of the categorical features in the regression model.

        Steps:
        1. Get the list of features used in the model. We keep the creation order of the different lags/features
            in create_lagged_data.
        2. Flag a feature as categorical only if it was built from a component (or static covariate column)
            whose name is exactly one of the names declared for that covariate kind. Target lags are never
            categorical. Exact, per-kind matching avoids flagging unrelated features whose generated name merely
            contains a categorical name as a substring, and avoids duplicate indices.

        Returns
        -------
        Tuple[List[int], List[str]]
            The positions of the categorical features in the feature list, and their generated names. Both lists
            are empty if no categorical covariates were declared.
        """
        # Empty names are dropped; they can never designate a feature.
        cat_past = set(filter(None, self.categorical_past_covariates or []))
        cat_future = set(filter(None, self.categorical_future_covariates or []))
        cat_static = set(filter(None, self.categorical_static_covariates or []))

        if not (cat_past or cat_future or cat_static):
            return [], []

        target_ts = get_single_series(series)
        past_covs_ts = get_single_series(past_covariates)
        fut_covs_ts = get_single_series(future_covariates)

        # We keep the creation order of the different lags/features in create_lagged_data.
        # Each entry is (feature name, is categorical). The `.components` of a covariate series
        # is only accessed when lags exist for that covariate kind, so a missing (None) covariate
        # without lags is never dereferenced.
        flagged_features: List[Tuple[str, bool]] = [
            (f"target_{component}_lag{lag}", False)
            for lag in self.lags.get("target", [])
            for component in target_ts.components
        ]
        flagged_features += [
            (f"past_cov_{component}_lag{lag}", component in cat_past)
            for lag in self.lags.get("past", [])
            for component in past_covs_ts.components
        ]
        flagged_features += [
            (f"fut_cov_{component}_lag{lag}", component in cat_future)
            for lag in self.lags.get("future", [])
            for component in fut_covs_ts.components
        ]
        if target_ts.has_static_covariates:
            flagged_features += [
                (column, column in cat_static)
                for column in target_ts.static_covariates.columns
            ]

        indices = [
            i for i, (_, is_categorical) in enumerate(flagged_features) if is_categorical
        ]
        col_names = [flagged_features[i][0] for i in indices]

        return indices, col_names

    def _fit_model(
        self,
        target_series,
        past_covariates,
        future_covariates,
        max_samples_per_ts,
        **kwargs,
    ):
        """
        Custom fit function for `RegressionModelWithCategoricalCovariates` models, adding logic to let the model
        handle categorical features directly.

        The indices of the categorical features are stored in `kwargs` under the name given by
        `_categorical_fit_param_name` (replacing any value already present under that key) before delegating
        to the parent class' `_fit_model()`.
        """
        cat_col_indices, _ = self._get_categorical_features(
            target_series,
            past_covariates,
            future_covariates,
        )

        kwargs[self._categorical_fit_param_name] = cat_col_indices
        super()._fit_model(
            target_series=target_series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            max_samples_per_ts=max_samples_per_ts,
            **kwargs,
        )
Loading