Feat/encoders extension (#1093)

dennisbader · hrzn · web-flow · commit a93296b8890e · 2022-08-10T22:48:34.000+02:00
* added encoders to regression models

* added unit tests for encoders in regression model

* remove torch flavor checks for local forecasting model tests

* reset ptl trainer when loading torch models

* reduced estimators for RandomForest test case

Co-authored-by: Julien Herzen <julien@unit8.co>
diff --git a/darts/models/forecasting/catboost_model.py b/darts/models/forecasting/catboost_model.py
@@ -23,6 +23,7 @@ def __init__(
         lags_past_covariates: Union[int, List[int]] = None,
         lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
         output_chunk_length: int = 1,
+        add_encoders: Optional[dict] = None,
         likelihood: str = None,
         quantiles: List = None,
         random_state: Optional[int] = None,
@@ -48,6 +49,26 @@ def __init__(
             Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
             horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
             be useful if the covariates don't extend far enough into the future.
+        add_encoders
+            A large number of past and future covariates can be automatically generated with `add_encoders`.
+            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
+            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
+            transform the generated covariates. This happens all under one hood and only needs to be specified at
+            model creation.
+            Read :meth:`SequentialEncoder <darts.utils.data.encoders.SequentialEncoder>` to find out more about
+            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:
+
+            .. highlight:: python
+            .. code-block:: python
+
+                add_encoders={
+                    'cyclic': {'future': ['month']},
+                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
+                    'position': {'past': ['absolute'], 'future': ['relative']},
+                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
+                    'transformer': Scaler()
+                }
+            ..
         likelihood
             Can be set to 'quantile', 'poisson' or 'gaussian'. If set, the model will be probabilistic,
             allowing sampling at prediction time. When set to 'gaussian', the model will use CatBoost's
@@ -96,6 +117,7 @@ def __init__(
             lags_past_covariates=lags_past_covariates,
             lags_future_covariates=lags_future_covariates,
             output_chunk_length=output_chunk_length,
+            add_encoders=add_encoders,
             model=CatBoostRegressor(**kwargs),
         )
 
diff --git a/darts/models/forecasting/ensemble_model.py b/darts/models/forecasting/ensemble_model.py
@@ -114,6 +114,11 @@ def _stack_ts_multiseq(self, predictions_list):
         # stacks multiple sequences of timeseries elementwise
         return [self._stack_ts_seq(ts_list) for ts_list in zip(*predictions_list)]
 
+    def _model_encoder_settings(self):
+        raise NotImplementedError(
+            "Encoders are not supported by EnsembleModels. Instead add encoder to the underlying `models`."
+        )
+
     def _make_multiple_predictions(
         self,
         n: int,
diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py
@@ -32,6 +32,7 @@
     _parallel_apply,
     _with_sanity_checks,
 )
+from darts.utils.data.encoders import SequentialEncoder
 from darts.utils.timeseries_generation import (
     _build_forecast_series,
     _generate_new_dates,
@@ -932,6 +933,13 @@ class GlobalForecastingModel(ForecastingModel, ABC):
     _expect_past_covariates, _expect_future_covariates = False, False
     past_covariate_series, future_covariate_series = None, None
 
+    def __init__(self, add_encoders: Optional[dict] = None):
+        """Store the encoder configuration; the encoder object itself is not built here.
+
+        Parameters
+        ----------
+        add_encoders
+            Optional encoder configuration dict. It is kept unchanged on the instance and later handed to
+            ``SequentialEncoder`` by ``initialize_encoders()``.
+        """
+        super().__init__()
+
+        # by default models do not use encoders
+        self.add_encoders = add_encoders
+        # stays ``None`` in this constructor; no SequentialEncoder is instantiated here
+        self.encoders: Optional[SequentialEncoder] = None
+
     @abstractmethod
     def fit(
         self,
@@ -1084,6 +1092,33 @@ def _supports_non_retrainable_historical_forecasts(self) -> bool:
         """GlobalForecastingModel supports historical forecasts without retraining the model"""
         return True
 
+    @property
+    @abstractmethod
+    def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]:
+        """Abstract property that returns model specific encoder settings that are used to initialize the encoders.
+
+        Must return Tuple (input_chunk_length, output_chunk_length, takes_past_covariates, takes_future_covariates)
+
+        ``initialize_encoders()`` unpacks the four values in exactly this order and passes them on to
+        ``SequentialEncoder`` under the same names, so the order is part of the contract.
+        """
+        pass
+
+    def initialize_encoders(self) -> SequentialEncoder:
+        """Instantiates the SequentialEncoder object based on self._model_encoder_settings and parameter
+        ``add_encoders`` used at model creation.
+
+        Returns
+        -------
+        SequentialEncoder
+            A newly created encoder object. This method only returns it; it does not assign it to
+            ``self.encoders``.
+        """
+        # the property yields a 4-tuple; unpack it so each value can be passed by keyword below
+        (
+            input_chunk_length,
+            output_chunk_length,
+            takes_past_covariates,
+            takes_future_covariates,
+        ) = self._model_encoder_settings
+
+        return SequentialEncoder(
+            add_encoders=self.add_encoders,
+            input_chunk_length=input_chunk_length,
+            output_chunk_length=output_chunk_length,
+            takes_past_covariates=takes_past_covariates,
+            takes_future_covariates=takes_future_covariates,
+        )
+
 
 class DualCovariatesForecastingModel(ForecastingModel, ABC):
     """The base class for the forecasting models that are not global, but support future covariates.
diff --git a/darts/models/forecasting/gradient_boosted_model.py b/darts/models/forecasting/gradient_boosted_model.py
@@ -27,6 +27,7 @@ def __init__(
         lags_past_covariates: Union[int, List[int]] = None,
         lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
         output_chunk_length: int = 1,
+        add_encoders: Optional[dict] = None,
         likelihood: str = None,
         quantiles: List[float] = None,
         random_state: Optional[int] = None,
@@ -52,6 +53,26 @@ def __init__(
             Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
             horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
             be useful if the covariates don't extend far enough into the future.
+        add_encoders
+            A large number of past and future covariates can be automatically generated with `add_encoders`.
+            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
+            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
+            transform the generated covariates. This happens all under one hood and only needs to be specified at
+            model creation.
+            Read :meth:`SequentialEncoder <darts.utils.data.encoders.SequentialEncoder>` to find out more about
+            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:
+
+            .. highlight:: python
+            .. code-block:: python
+
+                add_encoders={
+                    'cyclic': {'future': ['month']},
+                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
+                    'position': {'past': ['absolute'], 'future': ['relative']},
+                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
+                    'transformer': Scaler()
+                }
+            ..
         likelihood
             Can be set to `quantile` or `poisson`. If set, the model will be probabilistic, allowing sampling at
             prediction time.
@@ -87,6 +108,7 @@ def __init__(
             lags_past_covariates=lags_past_covariates,
             lags_future_covariates=lags_future_covariates,
             output_chunk_length=output_chunk_length,
+            add_encoders=add_encoders,
             model=lgb.LGBMRegressor(**kwargs),
         )
 
diff --git a/darts/models/forecasting/linear_regression_model.py b/darts/models/forecasting/linear_regression_model.py
@@ -25,6 +25,7 @@ def __init__(
         lags_past_covariates: Union[int, List[int]] = None,
         lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
         output_chunk_length: int = 1,
+        add_encoders: Optional[dict] = None,
         likelihood: str = None,
         quantiles: List[float] = None,
         random_state: Optional[int] = None,
@@ -50,6 +51,26 @@ def __init__(
             Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
             horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
             be useful if the covariates don't extend far enough into the future.
+        add_encoders
+            A large number of past and future covariates can be automatically generated with `add_encoders`.
+            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
+            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
+            transform the generated covariates. This happens all under one hood and only needs to be specified at
+            model creation.
+            Read :meth:`SequentialEncoder <darts.utils.data.encoders.SequentialEncoder>` to find out more about
+            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:
+
+            .. highlight:: python
+            .. code-block:: python
+
+                add_encoders={
+                    'cyclic': {'future': ['month']},
+                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
+                    'position': {'past': ['absolute'], 'future': ['relative']},
+                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
+                    'transformer': Scaler()
+                }
+            ..
         likelihood
             Can be set to `quantile` or `poisson`. If set, the model will be probabilistic, allowing sampling at
             prediction time. If set to `quantile`, the `sklearn.linear_model.QuantileRegressor` is used. Similarly, if
@@ -94,6 +115,7 @@ def __init__(
             lags_past_covariates=lags_past_covariates,
             lags_future_covariates=lags_future_covariates,
             output_chunk_length=output_chunk_length,
+            add_encoders=add_encoders,
             model=model,
         )
 
diff --git a/darts/models/forecasting/random_forest.py b/darts/models/forecasting/random_forest.py
@@ -31,6 +31,7 @@ def __init__(
         lags_past_covariates: Union[int, List[int]] = None,
         lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
         output_chunk_length: int = 1,
+        add_encoders: Optional[dict] = None,
         n_estimators: Optional[int] = 100,
         max_depth: Optional[int] = None,
         **kwargs,
@@ -55,6 +56,26 @@ def __init__(
             Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
             horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
             be useful if the covariates don't extend far enough into the future.
+        add_encoders
+            A large number of past and future covariates can be automatically generated with `add_encoders`.
+            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
+            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
+            transform the generated covariates. This happens all under one hood and only needs to be specified at
+            model creation.
+            Read :meth:`SequentialEncoder <darts.utils.data.encoders.SequentialEncoder>` to find out more about
+            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:
+
+            .. highlight:: python
+            .. code-block:: python
+
+                add_encoders={
+                    'cyclic': {'future': ['month']},
+                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
+                    'position': {'past': ['absolute'], 'future': ['relative']},
+                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
+                    'transformer': Scaler()
+                }
+            ..
         n_estimators : int
             The number of trees in the forest.
         max_depth : int
@@ -74,6 +95,7 @@ def __init__(
             lags_past_covariates=lags_past_covariates,
             lags_future_covariates=lags_future_covariates,
             output_chunk_length=output_chunk_length,
+            add_encoders=add_encoders,
             model=RandomForestRegressor(**kwargs),
         )
 
diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py
@@ -48,6 +48,7 @@ def __init__(
         lags_past_covariates: Union[int, List[int]] = None,
         lags_future_covariates: Union[Tuple[int, int], List[int]] = None,
         output_chunk_length: int = 1,
+        add_encoders: Optional[dict] = None,
         model=None,
     ):
         """Regression Model
@@ -71,14 +72,34 @@ def __init__(
             Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast
             horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may
             be useful if the covariates don't extend far enough into the future.
+        add_encoders
+            A large number of past and future covariates can be automatically generated with `add_encoders`.
+            This can be done by adding multiple pre-defined index encoders and/or custom user-made functions that
+            will be used as index encoders. Additionally, a transformer such as Darts' :class:`Scaler` can be added to
+            transform the generated covariates. This happens all under one hood and only needs to be specified at
+            model creation.
+            Read :meth:`SequentialEncoder <darts.utils.data.encoders.SequentialEncoder>` to find out more about
+            ``add_encoders``. Default: ``None``. An example showing some of ``add_encoders`` features:
+
+            .. highlight:: python
+            .. code-block:: python
+
+                add_encoders={
+                    'cyclic': {'future': ['month']},
+                    'datetime_attribute': {'future': ['hour', 'dayofweek']},
+                    'position': {'past': ['absolute'], 'future': ['relative']},
+                    'custom': {'past': [lambda idx: (idx.year - 1950) / 50]},
+                    'transformer': Scaler()
+                }
+            ..
         model
             Scikit-learn-like model with ``fit()`` and ``predict()`` methods. Also possible to use model that doesn't
             support multi-output regression for multivariate timeseries, in which case one regressor
             will be used per component in the multivariate series.
             If None, defaults to: ``sklearn.linear_model.LinearRegression(n_jobs=-1)``.
         """
 
-        super().__init__()
+        super().__init__(add_encoders=add_encoders)
 
         self.model = model
         self.lags = {}
@@ -200,6 +221,46 @@ def __init__(
         )
         self.output_chunk_length = output_chunk_length
 
+    @property
+    def _model_encoder_settings(self) -> Tuple[int, int, bool, bool]:
+        """Derive the encoder settings of this model from the covariate lags stored in ``self.lags``.
+
+        Returns
+        -------
+        Tuple[int, int, bool, bool]
+            ``(n_steps_back_inclusive, n_steps_ahead_exclusive, takes_past_covariates, takes_future_covariates)``.
+            If the "past" and "future" entries of ``self.lags`` contribute no lag at all, the result is
+            ``(0, 0, False, False)``.
+        """
+        # union of all past and future covariate lags; any other key of `self.lags` is not looked at here
+        lags_covariates = {
+            lag for key in ["past", "future"] for lag in self.lags.get(key, [])
+        }
+        if lags_covariates:
+            # for lags < 0 we need to take `n` steps backwards from past and/or historic future covariates
+            # for minimum lag = -1 -> steps_back_inclusive = 1
+            # inclusive means n steps back including the end of the target series
+            n_steps_back_inclusive = abs(min(min(lags_covariates), 0))
+            # for lags >= 0 we need to take `n` steps ahead from future covariates
+            # for maximum lag = 0 -> output_chunk_length = 1
+            # exclusive means n steps ahead after the last step of the target series
+            n_steps_ahead_exclusive = max(max(lags_covariates), 0) + 1
+            # covariate support is decided by key presence in `self.lags`
+            takes_past_covariates = "past" in self.lags
+            takes_future_covariates = "future" in self.lags
+        else:
+            # no covariate lags: encoders have nothing to generate for this model
+            n_steps_back_inclusive = 0
+            n_steps_ahead_exclusive = 0
+            takes_past_covariates = False
+            takes_future_covariates = False
+        return (
+            n_steps_back_inclusive,
+            n_steps_ahead_exclusive,
+            takes_past_covariates,
+            takes_future_covariates,
+        )
+
+    def _get_encoders_n(self, n):
+        """Returns the `n` encoder prediction steps specific to RegressionModels.
+        This will generate slightly more past covariates than the minimum requirement when using past and future
+        covariate lags simultaneously. This is because encoders were written for TorchForecastingModels where we only
+        needed `n` future covariates. For RegressionModel we need `n + max_future_lag`
+
+        Parameters
+        ----------
+        n
+            The number of prediction steps requested by the caller.
+
+        Returns
+        -------
+        ``n`` unchanged if the model takes no future covariates, otherwise ``n + (n_steps_ahead - 1)``, where
+        ``n_steps_ahead`` is the second element of ``self._model_encoder_settings``.
+        """
+        # only the steps-ahead value and the future-covariates flag are needed from the settings tuple
+        _, n_steps_ahead, _, takes_future_covariates = self._model_encoder_settings
+        if not takes_future_covariates:
+            return n
+        else:
+            # `n_steps_ahead` already includes one step (max lag + 1), hence the `- 1`
+            return n + (n_steps_ahead - 1)
+
     @property
     def min_train_series_length(self) -> int:
         return max(
@@ -319,6 +380,7 @@ def _fit_model(
         Function that fit the model. Deriving classes can override this method for adding additional parameters (e.g.,
         adding validation data), keeping the sanity checks on series performed by fit().
         """
+
         training_samples, training_labels = self._create_lagged_data(
             target_series, past_covariates, future_covariates, max_samples_per_ts
         )
@@ -361,6 +423,15 @@ def fit(
         **kwargs
             Additional keyword arguments passed to the `fit` method of the model.
         """
+
+        self.encoders = self.initialize_encoders()
+        if self.encoders.encoding_available:
+            past_covariates, future_covariates = self.encoders.encode_train(
+                target=series,
+                past_covariate=past_covariates,
+                future_covariate=future_covariates,
+            )
+
         super().fit(
             series=series,
             past_covariates=past_covariates,
@@ -477,6 +548,14 @@ def predict(
             logger,
         )
 
+        if self.encoders.encoding_available:
+            past_covariates, future_covariates = self.encoders.encode_inference(
+                n=self._get_encoders_n(n),
+                target=series,
+                past_covariate=past_covariates,
+                future_covariate=future_covariates,
+            )
+
         super().predict(n, series, past_covariates, future_covariates, num_samples)
 
         if series is None:
diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py
diff --git a/darts/tests/models/forecasting/test_local_forecasting_models.py b/darts/tests/models/forecasting/test_local_forecasting_models.py
diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py