Functionality to let LightGBM effectively handle categorical features #1585

Merged
Commits (32; diff shown from 20 of the 32 commits):

- ec8fe6d: #1580 exploration (Feb 20, 2023)
- 5ee855d: #1580 added cat_components to TimeSeries (Feb 20, 2023)
- 149b2c7: #1580 _fit_model method LightGBM (Feb 20, 2023)
- b02e8b1: #1580 included static covs in dummy unit test (Feb 20, 2023)
- 948be36: #1580 integration with lgbm (Feb 20, 2023)
- 3c2fee2: #1580 helper func to method in RegressionModel (Feb 21, 2023)
- c3d642f: #1580 different approach; pass categorical covs to fit method of lgbm… (Feb 21, 2023)
- 5679eeb: #1580 added few unit tests (Feb 21, 2023)
- ef7fcf8: #1580 small stuff (Feb 21, 2023)
- 5a5a09f: Merge branch 'master' into feature/use_model_native_way_cat_features (madtoinou, Feb 22, 2023)
- 4c5b140: #1580 move categorical covs to model constructor (Feb 27, 2023)
- f6b25fc: #1580 avoid code duplication in unit tests (Feb 28, 2023)
- e7cde27: #1580 add unit test on forecast quality with cat covs (Feb 28, 2023)
- d8aa69f: #1580 add column names check in _get_categorical_covs helper (Feb 28, 2023)
- 5be4f4c: #1580 docstrings lgbm (Feb 28, 2023)
- dc9ceeb: #1580 add changelog entry (Feb 28, 2023)
- 713a850: Merge branch 'feature/use_model_native_way_cat_features' of https://g… (Feb 28, 2023)
- 165d1bc: #1580 change check if ts has static cov (Feb 28, 2023)
- d02d3a0: Merge branch 'master' into feature/use_model_native_way_cat_features (Feb 28, 2023)
- 95bf521: Merge branch 'master' into feature/use_model_native_way_cat_features (dennisbader, Mar 5, 2023)
- 9df90ae: #1580 implemented RegressionModelWithCategoricalCovariates class (Mar 12, 2023)
- 36e56de: #1580 delete redundant test (Mar 12, 2023)
- e85bad2: #1580 replace test_quality_forecast_with_categorical_covariates unit … (Mar 12, 2023)
- 9ba3190: #1580 adjustment error messages validation method (Mar 12, 2023)
- 5f2535b: #1580 adding categorical feature support for CatBoost (Mar 12, 2023)
- ae1d4df: #1580 remove cat support CatBoost and smaller comments Dennis (Mar 27, 2023)
- 7cb8c72: #1580 finalizing (Mar 27, 2023)
- 20073fe: Merge branch 'master' into feature/use_model_native_way_cat_features (Mar 27, 2023)
- 6eb4ed4: #1580 use parent _fit_model method (Mar 27, 2023)
- 5dc1341: Merge branch 'master' into feature/use_model_native_way_cat_features (Mar 27, 2023)
- fc41cd8: avoid creating lagged data twice (dennisbader, Mar 27, 2023)
- 0836ff2: remove empty lines (dennisbader, Mar 27, 2023)
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co

## [Unreleased](https://github.com/unit8co/darts/tree/master)
[Full Changelog](https://github.com/unit8co/darts/compare/0.23.1...master)
- `LightGBM` model now supports native categorical feature handling as described
[here](https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features).
[#1585](https://github.com/unit8co/darts/pull/1585) by [Rijk van der Meulen](https://github.com/rijkvandermeulen)

## [0.23.1](https://github.com/unit8co/darts/tree/0.23.1) (2023-01-12)
Patch release
95 changes: 94 additions & 1 deletion darts/models/forecasting/lgbm.py
@@ -15,7 +15,7 @@
import lightgbm as lgb
import numpy as np

from darts.logging import get_logger
from darts.logging import get_logger, raise_log
from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin
from darts.timeseries import TimeSeries

@@ -34,6 +34,9 @@ def __init__(
quantiles: List[float] = None,
random_state: Optional[int] = None,
multi_models: Optional[bool] = True,
categorical_past_covariates: Optional[List[str]] = None,
categorical_future_covariates: Optional[List[str]] = None,
Review comment (Collaborator): could we also allow single strings?

categorical_static_covariates: Optional[List[str]] = None,
**kwargs,
):
"""LGBM Model
@@ -87,6 +90,20 @@ def __init__(
multi_models
If True, a separate model will be trained for each future lag to predict. If False, a single model is
trained to predict at step 'output_chunk_length' in the future. Default: True.
categorical_past_covariates
Optionally, a list of component names specifying the past covariates that should be treated as categorical
by the underlying `lightgbm.LGBMRegressor`. It's recommended that the components that are treated as
categorical are integer-encoded. For more information on how LightGBM handles categorical features, visit:
`Categorical feature support documentation
<https://lightgbm.readthedocs.io/en/latest/Features.html#optimal-split-for-categorical-features>`_
categorical_future_covariates
Optionally, a list of component names specifying the future covariates that should be treated as categorical
by the underlying `lightgbm.LGBMRegressor`. It's recommended that the components that are treated as
categorical are integer-encoded.
categorical_static_covariates
Optionally, a list of names specifying the static covariates that should be treated as categorical
by the underlying `lightgbm.LGBMRegressor`. It's recommended that the static covariates that are
treated as categorical are integer-encoded.
**kwargs
Additional keyword arguments passed to `lightgbm.LGBMRegressor`.
"""
@@ -97,6 +114,9 @@ def __init__(
self.quantiles = None
self.likelihood = likelihood
self._rng = None
self.categorical_past_covariates = categorical_past_covariates
self.categorical_future_covariates = categorical_future_covariates
self.categorical_static_covariates = categorical_static_covariates

# parse likelihood
available_likelihoods = ["quantile", "poisson"] # to be extended
@@ -163,6 +183,43 @@ def fit(
Additional kwargs passed to `lightgbm.LGBRegressor.fit()`
"""

# Validate that categorical covariates of the model are a subset of all covariates
Review comment (Collaborator): We could make this a private method (like _check_categorical_covariates) of this new base class mentioned in the earlier comment, so we can later reuse it for the other models supporting categorical covariates. This method would always be called by all models inheriting from the new "base" class.

for categorical_covariates, covariates, cov_type in zip(
[self.categorical_past_covariates, self.categorical_future_covariates],
[past_covariates, future_covariates],
["past_covariates", "future_covariates"],
):
if categorical_covariates:
if not covariates:
raise_log(
ValueError(
f"Categorical {cov_type} are declared in the model constructor but no "
f"{cov_type} are passed to the `fit()` call."
),
)
s = covariates if isinstance(covariates, TimeSeries) else covariates[0]
if not set(categorical_covariates).issubset(set(s.components)):
raise_log(
ValueError(
f"Some {cov_type} ({set(categorical_covariates) - set(s.components)}) "
f"declared as categorical in the model constructor are not "
f"present in the {cov_type} passed to the `fit()` call."
)
)
if self.categorical_static_covariates:
s = series if isinstance(series, TimeSeries) else series[0]
if not set(self.categorical_static_covariates).issubset(
set(s.static_covariates.columns)
):
raise_log(
ValueError(
f"Some static covariates "
f"({set(self.categorical_static_covariates) - set(s.static_covariates.columns)}) "
f"declared as categorical in the model constructor are not "
f"present in the series passed to the `fit()` call."
)
)
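The validation above boils down to the same subset check applied to past, future, and static covariates. A standalone sketch of that core idea (the helper name and messages are hypothetical, mirroring but not importing the darts code):

```python
def check_categorical_subset(declared, available, cov_type):
    """Raise if any component declared as categorical is missing from the fitted covariates."""
    missing = set(declared) - set(available)
    if missing:
        raise ValueError(
            f"Some {cov_type} ({missing}) declared as categorical in the model "
            f"constructor are not present in the {cov_type} passed to the `fit()` call."
        )

check_categorical_subset(["promo"], ["promo", "price"], "past_covariates")  # passes silently
try:
    check_categorical_subset(["holiday"], ["promo"], "future_covariates")
except ValueError as err:
    print("rejected:", err)
```

Failing fast here is what turns a silent type mismatch inside LightGBM into a readable error at `fit()` time.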

if val_series is not None:
kwargs["eval_set"] = self._create_lagged_data(
target_series=val_series,
@@ -200,6 +257,42 @@

return self

def _fit_model(
self,
target_series,
past_covariates,
future_covariates,
max_samples_per_ts,
**kwargs,
):
"""
Custom fit function for the LightGBM model; adding logic to let the model handle categorical features
directly.
"""

training_samples, training_labels = self._create_lagged_data(
target_series,
past_covariates,
future_covariates,
max_samples_per_ts,
)

cat_cols_indices, _ = self._get_categorical_features(
Review comment (Collaborator): in the new "base" class you can override _fit_model() and avoid having the same logic in two places. Something like below:

    def _fit_model(..., **kwargs):
        cat_cols_indices, _ = self._get_categorical_features(...)
        kwargs["categorical_feature"] = cat_cols_indices
        super()._fit_model(..., **kwargs)

Review comment (Collaborator): A mapping for getting the correct parameter name per model could allow to dynamically provide the categorical features, i.e. "cat_features" for CatBoost, "categorical_feature" for LightGBM:

    self.categorical_fit_param_name = "categorical_feature"

target_series,
past_covariates,
future_covariates,
)

# if training_labels is of shape (n_samples, 1) flatten it to shape (n_samples,)
if len(training_labels.shape) == 2 and training_labels.shape[1] == 1:
training_labels = training_labels.ravel()
self.model.fit(
training_samples,
training_labels,
categorical_feature=cat_cols_indices,
**kwargs,
)
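The second review comment above suggests keying the fit-time parameter name on the model. A sketch of that dispatch (the mapping and helper are hypothetical; only the kwarg names themselves, `categorical_feature` for `lightgbm.LGBMRegressor.fit` and `cat_features` for CatBoost, come from the underlying libraries):

```python
# Hypothetical mapping from darts model name to the underlying library's fit() kwarg
CATEGORICAL_FIT_PARAM = {
    "LightGBMModel": "categorical_feature",  # lightgbm.LGBMRegressor.fit
    "CatBoostModel": "cat_features",         # catboost.CatBoostRegressor.fit
}

def with_categorical_kwarg(model_name, cat_cols_indices, fit_kwargs):
    """Return a copy of fit_kwargs with the model-specific categorical parameter injected."""
    fit_kwargs = dict(fit_kwargs)
    fit_kwargs[CATEGORICAL_FIT_PARAM[model_name]] = cat_cols_indices
    return fit_kwargs

print(with_categorical_kwarg("LightGBMModel", [2, 3], {"verbose": False}))
```

This keeps the shared `_fit_model` generic while each subclass contributes only one mapping entry.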

def _predict_and_sample(
self, x: np.ndarray, num_samples: int, **kwargs
) -> np.ndarray:
90 changes: 88 additions & 2 deletions darts/models/forecasting/regression_model.py
@@ -26,9 +26,8 @@
When static covariates are present, they are appended to the lagged features. When multiple time series are passed,
if their static covariates do not have the same size, the shorter ones are padded with 0 valued features.
"""

from collections import OrderedDict
from typing import List, Optional, Sequence, Tuple, Union
from typing import List, Optional, Protocol, Sequence, Tuple, Union, runtime_checkable

import numpy as np
from sklearn.linear_model import LinearRegression
@@ -812,6 +811,86 @@ def _predict_and_sample(

return prediction.reshape(k, self.pred_dim, -1)

def _get_categorical_features(
self,
series: Union[List[TimeSeries], TimeSeries],
past_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
future_covariates: Optional[Union[List[TimeSeries], TimeSeries]] = None,
) -> Tuple[List[int], List[str]]:
"""
Returns the indices and column names of the categorical features in the regression model.

Steps:
1. Get the list of features used in the model. We keep the creation order of the different lags/features
in create_lagged_data.
2. Get the indices of the categorical features in the list of features.
"""

assert isinstance(self, SupportsCategoricalCovariates), (
Review comment (Collaborator): in the new "base" class we could drop this check and SupportsCategoricalCovariates

"The `_get_categorical_features` method is only available for RegressionModels that support "
"categorical covariates."
)

categorical_covariates = (
(
self.categorical_past_covariates
if self.categorical_past_covariates
else []
)
+ (
self.categorical_future_covariates
if self.categorical_future_covariates
else []
)
+ (
self.categorical_static_covariates
if self.categorical_static_covariates
else []
)
)

if not categorical_covariates:
return [], []
else:
Review comment (Collaborator): you can remove the else and unindent

target_ts = series if isinstance(series, TimeSeries) else series[0]
past_covs_ts = past_covariates[0] if past_covariates else None
fut_covs_ts = future_covariates[0] if future_covariates else None

# We keep the creation order of the different lags/features in create_lagged_data
feature_list = (
[
f"target_{component}_lag{lag}"
for lag in self.lags.get("target", [])
for component in target_ts.components
]
+ [
f"past_cov_{component}_lag{lag}"
for lag in self.lags.get("past", [])
for component in past_covs_ts.components
]
+ [
f"fut_cov_{component}_lag{lag}"
for lag in self.lags.get("future", [])
for component in fut_covs_ts.components
]
+ (
list(target_ts.static_covariates.columns)
if target_ts.has_static_covariates
# if isinstance(target_ts.static_covariates, pd.DataFrame)
else []
)
)

indices = [
i
for i, col in enumerate(feature_list)
for cat in categorical_covariates
if cat and cat in col
Review comment (Collaborator): is the `if cat` required?

]
col_names = [feature_list[i] for i in indices]

return indices, col_names
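The two steps in the method's docstring (rebuild the feature order used by create_lagged_data, then locate the categorical columns by substring match) can be replayed with plain lists. This standalone sketch uses a hypothetical function that mirrors, but does not import, the darts method:

```python
def categorical_feature_indices(lags, target_comps, past_comps, fut_comps, static_cols, categorical):
    """Rebuild the create_lagged_data feature order, then find the categorical columns."""
    features = (
        [f"target_{c}_lag{lag}" for lag in lags.get("target", []) for c in target_comps]
        + [f"past_cov_{c}_lag{lag}" for lag in lags.get("past", []) for c in past_comps]
        + [f"fut_cov_{c}_lag{lag}" for lag in lags.get("future", []) for c in fut_comps]
        + list(static_cols)
    )
    # Substring match, as in the darts implementation above
    indices = [i for i, col in enumerate(features) if any(cat in col for cat in categorical)]
    return indices, [features[i] for i in indices]

idx, names = categorical_feature_indices(
    lags={"target": [-1], "past": [-1]},
    target_comps=["sales"],
    past_comps=["price", "promo"],
    fut_comps=[],
    static_cols=["store_id"],
    categorical=["promo", "store_id"],
)
print(idx, names)  # [2, 3] ['past_cov_promo_lag-1', 'store_id']
```

Note the substring match means a categorical name that is a substring of another component name would also match, which may be why the column-names check was added in commit d8aa69f.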

def __str__(self):
return self.model.__str__()

Expand All @@ -820,6 +899,13 @@ def _supports_static_covariates() -> bool:
return True


@runtime_checkable
Review comment (Collaborator): could be removed if we go with the new "base" class

class SupportsCategoricalCovariates(Protocol):
categorical_past_covariates: Optional[List[str]]
categorical_future_covariates: Optional[List[str]]
categorical_static_covariates: Optional[List[str]]


class _LikelihoodMixin:
"""
A class containing functions supporting quantile, poisson and gaussian regression, to be used as a mixin for some