From e7efa59150419c43c334b4b7d495dc1cd8017aa2 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Tue, 28 Mar 2023 20:56:51 +0200 Subject: [PATCH 01/12] feat: create and store the lagged features names in the regression models --- darts/models/forecasting/regression_model.py | 74 ++++++++++++++- darts/utils/data/tabularization.py | 97 +++++++++++++++++++- 2 files changed, 168 insertions(+), 3 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 34ff056e0a..fe5a66bf4e 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -35,7 +35,10 @@ from darts.logging import get_logger, raise_if, raise_if_not, raise_log from darts.models.forecasting.forecasting_model import GlobalForecastingModel from darts.timeseries import TimeSeries -from darts.utils.data.tabularization import create_lagged_training_data +from darts.utils.data.tabularization import ( + create_lagged_components_names, + create_lagged_training_data, +) from darts.utils.multioutput import MultiOutputRegressor from darts.utils.utils import ( _check_quantiles, @@ -356,6 +359,32 @@ def _create_lagged_data( return training_samples, training_labels + def _create_lagged_components_name( + self, target_series, past_covariates, future_covariates + ): + lags = self.lags.get("target") + lags_past_covariates = self.lags.get("past") + lags_future_covariates = self.lags.get("future") + + features_cols_name, labels_cols_name = create_lagged_components_names( + target_series=target_series, + past_covariates=past_covariates, + future_covariates=future_covariates, + lags=lags, + lags_past_covariates=lags_past_covariates, + lags_future_covariates=lags_future_covariates, + output_chunk_length=self.output_chunk_length, + concatenate=False, + ) + + # adding the static covariates on the right of each features_cols_name + features_cols_name = self._add_static_covariates_name( + features_cols_name, + 
target_series, + ) + + return features_cols_name, labels_cols_name + def _add_static_covariates( self, features: Union[np.array, Sequence[np.array]], @@ -443,6 +472,41 @@ def _add_static_covariates( features = features[0] return features + def _add_static_covariates_name( + self, + features_cols_name: List[List[str]], + target_series: Union[TimeSeries, Sequence[TimeSeries]], + ) -> Union[np.array, Sequence[np.array]]: + """ + Add static covariates names to the features name for RegressionModels. + Accounts for series with potentially different static covariates to accomodate for the maximum + number of available static_covariates in any of the given series in the sequence. + + Parameters + ---------- + features_cols_name + The name of the features of the numpy array(s) to which the static covariates will be added, generated with + `create_lagged_components_names()` + target_series + The target series from which to read the static covariates. + + Returns + ------- + features_cols_name + The features' name list with appended static covariates names on the right. 
+ """ + target_series = series2seq(target_series) + + # collect static covariates info + static_covs_names = [] + for ts in target_series: + if ts.has_static_covariates: + static_covs_names += list(ts.static_covariates.keys()) + + return [ + feat_cols_name + static_covs_names for feat_cols_name in features_cols_name + ] + def _fit_model( self, target_series, @@ -468,6 +532,14 @@ def _fit_model( training_labels = training_labels.ravel() self.model.fit(training_samples, training_labels, **kwargs) + # generate and store the lagged components names (for feature importance analysis) + lagged_features_names, _ = self._create_lagged_components_name( + target_series=target_series, + past_covariates=past_covariates, + future_covariates=future_covariates, + ) + self.model.lagged_features_name_ = lagged_features_names + def fit( self, series: Union[TimeSeries, Sequence[TimeSeries]], diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 2cb7fb48e8..a4c8789ea6 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -1,7 +1,8 @@ import warnings from functools import reduce +from itertools import zip_longest from math import inf -from typing import Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union try: from typing import Literal @@ -69,7 +70,8 @@ def create_lagged_data( The shape of `X` is: `X.shape = (n_observations, n_lagged_features, n_samples)`, where `n_observations` equals either the number of time points shared between all specified series, - or `max_samples_per_ts`, whichever is smallest. The shape of `y` is: + or `max_samples_per_ts`, whichever is smallest. + The shape of `y` is: `y.shape = (n_observations, output_chunk_length, n_samples)`, if `multi_models = True`, otherwise: `y.shape = (n_observations, 1, n_samples)`. @@ -227,6 +229,11 @@ def create_lagged_data( ---------- .. [1] https://otexts.com/fpp2/AR.html#AR .. 
[2] https://unit8.com/resources/time-series-forecasting-using-past-and-future-external-data-with-darts/ + + See Also + -------- + tabularization.create_lagged_components_names : return the lagged features names as a list of strings. + """ raise_if( is_training and (target_series is None), @@ -527,6 +534,92 @@ def create_lagged_prediction_data( return X, times +def create_lagged_components_names( + target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, + past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, + future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, + lags: Optional[Sequence[int]] = None, + lags_past_covariates: Optional[Sequence[int]] = None, + lags_future_covariates: Optional[Sequence[int]] = None, + output_chunk_length: int = 1, + concatenate: bool = True, +) -> Tuple[List[List[str]], List[List[str]]]: + """ + Helper function called to retrieve the name of the features and labels arrays created with + `create_lagged_data()`. The convention are the following: + + Along the `n_lagged_features` axis, `X` has the following structure (for `*_lags=[-2,-1]` and + `*_series.n_components = 2`): + lagged_target | lagged_past_covariates | lagged_future_covariates + where each `lagged_*` has the following structure: + lag_-2_comp_1_* | lag_-2_comp_2_* | lag_-1_comp_1_* | lag_-1_comp_2_* + + Along the `n_lagged_labels` axis, `y` has the following structure (for `output_chunk_length=4` and + `target_series.n_components=2`): + lag_+0_comp_1_target | lag_+0_comp_2_target | ... | lag_+3_comp_1_target | lag_+3_comp_2_target + + Returns + ------- + features_cols_name + The names of the lagged features in the `X` array generated by `create_lagged_data()`. + `features_cols_name` always returned as a `List[List[str]]` and if `concatenate=True`, + each entry also contains the columns names for the `y` array (on the right). 
+ labels_cols_name + The names of the lagged features in the `y` array generated by `create_lagged_data()`. + `labels_cols_name` always returned as a `List[List[str]]` + + See Also + -------- + tabularization.create_lagged_data : generate the lagged features and labels as (list of) Arrays. + """ + target_series = ( + [target_series] if not isinstance(target_series, List) else target_series + ) + past_covariates = ( + [past_covariates] if not isinstance(past_covariates, List) else past_covariates + ) + future_covariates = ( + [future_covariates] + if not isinstance(future_covariates, List) + else future_covariates + ) + + features_cols_name = [] + labels_cols_name = [] + for target_ts, past_cov_ts, future_cov_ts in zip_longest( + target_series, past_covariates, future_covariates, fillvalue=None + ): + covariates_specs = [] + if target_ts and lags: + covariates_specs.append((target_ts.components, lags)) + if past_cov_ts and lags_past_covariates: + covariates_specs.append((past_cov_ts.components, lags_past_covariates)) + if future_cov_ts and lags_future_covariates: + covariates_specs.append((future_cov_ts.components, lags_future_covariates)) + + features_cols_name.append( + [ + f"lag_{lag_idx}_{comp_name}" + for variate_components, variates_lags in covariates_specs + for lag_idx in variates_lags + for comp_name in variate_components + ] + ) + + labels_cols_name.append( + [ + f"lag_{lag_idx}_{comp_name}" + for lag_idx in range(0, output_chunk_length) + for comp_name in target_ts.components + ] + ) + + if concatenate: + features_cols_name[-1] += labels_cols_name[-1] + + return features_cols_name, labels_cols_name + + def _create_lagged_data_by_moving_window( target_series: TimeSeries, output_chunk_length: int, From cc77936eaa11194127a98a6e7d6509c0b5f8bf63 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Tue, 28 Mar 2023 20:57:32 +0200 Subject: [PATCH 02/12] feat: adding corresponding tests in tabularization --- .../test_create_lagged_training_data.py | 146 
+++++++++++++++++- 1 file changed, 144 insertions(+), 2 deletions(-) diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index 03efadf2cc..32f3e4ba54 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -9,7 +9,10 @@ from darts import concatenate as darts_concatenate from darts.logging import get_logger, raise_if, raise_if_not, raise_log from darts.tests.base_test_class import DartsBaseTestClass -from darts.utils.data.tabularization import create_lagged_training_data +from darts.utils.data.tabularization import ( + create_lagged_components_names, + create_lagged_training_data, +) from darts.utils.timeseries_generation import linear_timeseries @@ -49,7 +52,7 @@ def create_multivariate_linear_timeseries( timeseries = [] for i in range(n_components): # Values of each component is 1 larger than the last: - timeseries_i = linear_timeseries(**kwargs) + i + timeseries_i = linear_timeseries(column_name=f"lin_ts_{i}", **kwargs) + i timeseries.append(timeseries_i) return darts_concatenate(timeseries, axis=1) @@ -1795,3 +1798,142 @@ def test_lagged_training_data_unspecified_lag_or_series_warning(self): use_moving_windows=use_moving_windows, ) self.assertEqual(len(w), 0) + + def test_create_lagged_components_names(self): + """ + Tests that `create_lagged_components_names` produces the expected features name depending + on the lags, output_chunk_length and covariates. 
+ """ + target_with_no_cov = self.create_multivariate_linear_timeseries( + n_components=1, start_value=0, end_value=10, start=2, length=10, freq=2 + ) + + target_with_cov = self.create_multivariate_linear_timeseries( + n_components=2, start_value=0, end_value=10, start=2, length=10, freq=2 + ) + target_with_cov = target_with_cov.with_static_covariates( + pd.Series([1], index=["dummy_static_cov"]) + ) + + past = self.create_multivariate_linear_timeseries( + n_components=3, start_value=10, end_value=20, start=2, length=10, freq=2 + ) + future = self.create_multivariate_linear_timeseries( + n_components=4, start_value=20, end_value=30, start=2, length=10, freq=2 + ) + + # target no static covariate + expected_lagged_features = ["lag_-2_lin_ts_0", "lag_-1_lin_ts_0"] + created_lagged_features, _ = create_lagged_components_names( + target_series=target_with_no_cov, + past_covariates=None, + future_covariates=None, + lags=[-2, -1], + lags_past_covariates=None, + lags_future_covariates=None, + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features[0]) + + # target with static covariate (not handled by this function) + expected_lagged_features = [ + "lag_-4_lin_ts_0", + "lag_-4_lin_ts_1", + "lag_-1_lin_ts_0", + "lag_-1_lin_ts_1", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=target_with_cov, + past_covariates=None, + future_covariates=None, + lags=[-4, -1], + lags_past_covariates=None, + lags_future_covariates=None, + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features[0]) + + # target + past + expected_lagged_features = [ + "lag_-4_lin_ts_0", + "lag_-3_lin_ts_0", + "lag_-1_lin_ts_0", + "lag_-1_lin_ts_1", + "lag_-1_lin_ts_2", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=target_with_no_cov, + past_covariates=past, + future_covariates=None, + lags=[-4, -3], + lags_past_covariates=[-1], + lags_future_covariates=None, + 
concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features[0]) + + # target + future + expected_lagged_features = [ + "lag_-2_lin_ts_0", + "lag_-1_lin_ts_0", + "lag_3_lin_ts_0", + "lag_3_lin_ts_1", + "lag_3_lin_ts_2", + "lag_3_lin_ts_3", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=target_with_no_cov, + past_covariates=None, + future_covariates=future, + lags=[-2, -1], + lags_past_covariates=None, + lags_future_covariates=[3], + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features[0]) + + # past + future + expected_lagged_features = [ + "lag_-1_lin_ts_0", + "lag_-1_lin_ts_1", + "lag_-1_lin_ts_2", + "lag_2_lin_ts_0", + "lag_2_lin_ts_1", + "lag_2_lin_ts_2", + "lag_2_lin_ts_3", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=target_with_no_cov, + past_covariates=past, + future_covariates=future, + lags=None, + lags_past_covariates=[-1], + lags_future_covariates=[2], + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features[0]) + + # target with static + past + future + expected_lagged_features = [ + "lag_-2_lin_ts_0", + "lag_-2_lin_ts_1", + "lag_-1_lin_ts_0", + "lag_-1_lin_ts_1", + "lag_-1_lin_ts_0", + "lag_-1_lin_ts_1", + "lag_-1_lin_ts_2", + "lag_2_lin_ts_0", + "lag_2_lin_ts_1", + "lag_2_lin_ts_2", + "lag_2_lin_ts_3", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=target_with_cov, + past_covariates=past, + future_covariates=future, + lags=[-2, -1], + lags_past_covariates=[-1], + lags_future_covariates=[2], + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features[0]) From dbc942b4047acf247ecfd1d102fcd0a29eee8590 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Wed, 29 Mar 2023 11:50:13 +0200 Subject: [PATCH 03/12] fix: support any kind of Sequence to generate the lagged features name --- 
darts/utils/data/tabularization.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index a4c8789ea6..af099ba31f 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -573,14 +573,16 @@ def create_lagged_components_names( tabularization.create_lagged_data : generate the lagged features and labels as (list of) Arrays. """ target_series = ( - [target_series] if not isinstance(target_series, List) else target_series + [target_series] if not isinstance(target_series, Sequence) else target_series ) past_covariates = ( - [past_covariates] if not isinstance(past_covariates, List) else past_covariates + [past_covariates] + if not isinstance(past_covariates, Sequence) + else past_covariates ) future_covariates = ( [future_covariates] - if not isinstance(future_covariates, List) + if not isinstance(future_covariates, Sequence) else future_covariates ) From 10060b7e2037da49edf449e84b192a307558aa74 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Wed, 29 Mar 2023 11:56:08 +0200 Subject: [PATCH 04/12] feat: verify that the number of lagged feature names matches the feature_importances in the relevant regression models --- darts/models/forecasting/regression_model.py | 6 ++- .../forecasting/test_regression_models.py | 37 ++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index fe5a66bf4e..8b570fa2d5 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -497,11 +497,13 @@ def _add_static_covariates_name( """ target_series = series2seq(target_series) - # collect static covariates info + # collect static covariates info, preserve the order static_covs_names = [] for ts in target_series: if ts.has_static_covariates: - static_covs_names += list(ts.static_covariates.keys()) + 
for static_cov_name in ts.static_covariates.keys(): + if static_cov_name not in static_covs_names: + static_covs_names.append(static_cov_name) return [ feat_cols_name + static_covs_names for feat_cols_name in features_cols_name diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 501704c56d..5b89df9c0a 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -960,7 +960,7 @@ def test_static_cov_appended_values(self): ) # Take only last sample: expected_X_pred = expected_X_pred[-1, :].reshape(1, -1) - # Number of static covss to add = difference in width between feature + # Number of static covs to add = difference in width between feature # matrix *with* static covs and feature matrix *without* static covs: scov_width = expected_X.shape[1] - expected_X_pred.shape[1] zeros_scovs = np.zeros((1, scov_width)) @@ -1156,6 +1156,21 @@ def test_static_cov_accuracy(self): pred_no_static_cov = model_no_static_cov.predict( n=period, series=fitting_series ) + expected_features_in = [ + [f"lag_{str(-i)}_smooth" for i in range(period // 2, 0, -1)], + [f"lag_{str(-i)}_irregular" for i in range(period // 2, 0, -1)], + ] + self.assertEqual( + model_no_static_cov.model.lagged_features_name_, expected_features_in + ) + self.assertEqual( + len(model_no_static_cov.model.feature_importances_), + len(expected_features_in[0]), + ) + self.assertEqual( + len(model_no_static_cov.model.feature_importances_), + len(expected_features_in[1]), + ) fitting_series = [ train_series_static_cov[0][: (60 - period)], @@ -1163,6 +1178,26 @@ def test_static_cov_accuracy(self): ] model_static_cov = RandomForest(lags=period // 2, bootstrap=False) model_static_cov.fit(fitting_series) + + series_static_cov_names = ["curve_type"] + expected_features_in = [ + [f"lag_{str(-i)}_smooth" for i in range(period // 2, 0, -1)] + + series_static_cov_names, + 
[f"lag_{str(-i)}_irregular" for i in range(period // 2, 0, -1)] + + series_static_cov_names, + ] + self.assertEqual( + model_static_cov.model.lagged_features_name_, expected_features_in + ) + self.assertEqual( + len(model_static_cov.model.feature_importances_), + len(expected_features_in[0]), + ) + self.assertEqual( + len(model_static_cov.model.feature_importances_), + len(expected_features_in[1]), + ) + pred_static_cov = model_static_cov.predict(n=period, series=fitting_series) # then From 69a38191cdc354a5d13800b46abe75780426c381 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Wed, 5 Apr 2023 16:02:03 +0200 Subject: [PATCH 05/12] fix: if any of the variate is a sequence of ts with different components names, create generic name for the corresponding variate, updated the tests --- darts/models/forecasting/regression_model.py | 4 +- .../forecasting/test_regression_models.py | 28 +-- .../test_create_lagged_training_data.py | 179 +++++++++++++----- darts/utils/data/tabularization.py | 83 ++++---- 4 files changed, 189 insertions(+), 105 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 8b570fa2d5..ff4944d556 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -505,9 +505,7 @@ def _add_static_covariates_name( if static_cov_name not in static_covs_names: static_covs_names.append(static_cov_name) - return [ - feat_cols_name + static_covs_names for feat_cols_name in features_cols_name - ] + return features_cols_name + static_covs_names def _fit_model( self, diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 5b89df9c0a..0a8503aae4 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1156,20 +1156,18 @@ def test_static_cov_accuracy(self): pred_no_static_cov = 
model_no_static_cov.predict( n=period, series=fitting_series ) + # multiple series with different components names ("smooth" and "irregular"), + # triggers creation of generic feature names expected_features_in = [ - [f"lag_{str(-i)}_smooth" for i in range(period // 2, 0, -1)], - [f"lag_{str(-i)}_irregular" for i in range(period // 2, 0, -1)], + f"lag_{str(-i)}_target_0" for i in range(period // 2, 0, -1) ] + self.assertEqual( model_no_static_cov.model.lagged_features_name_, expected_features_in ) self.assertEqual( len(model_no_static_cov.model.feature_importances_), - len(expected_features_in[0]), - ) - self.assertEqual( - len(model_no_static_cov.model.feature_importances_), - len(expected_features_in[1]), + len(expected_features_in), ) fitting_series = [ @@ -1179,23 +1177,17 @@ def test_static_cov_accuracy(self): model_static_cov = RandomForest(lags=period // 2, bootstrap=False) model_static_cov.fit(fitting_series) - series_static_cov_names = ["curve_type"] + # multiple univariates series with different names with same static cov expected_features_in = [ - [f"lag_{str(-i)}_smooth" for i in range(period // 2, 0, -1)] - + series_static_cov_names, - [f"lag_{str(-i)}_irregular" for i in range(period // 2, 0, -1)] - + series_static_cov_names, - ] + f"lag_{str(-i)}_target_0" for i in range(period // 2, 0, -1) + ] + ["curve_type"] + self.assertEqual( model_static_cov.model.lagged_features_name_, expected_features_in ) self.assertEqual( len(model_static_cov.model.feature_importances_), - len(expected_features_in[0]), - ) - self.assertEqual( - len(model_static_cov.model.feature_importances_), - len(expected_features_in[1]), + len(expected_features_in), ) pred_static_cov = model_static_cov.predict(n=period, series=fitting_series) diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index 32f3e4ba54..e4a2c95000 100644 --- 
a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -41,7 +41,7 @@ class CreateLaggedTrainingDataTestCase(DartsBaseTestClass): @staticmethod def create_multivariate_linear_timeseries( - n_components: int, **kwargs + n_components: int, components_names: Sequence[str] = None, **kwargs ) -> TimeSeries: """ Helper function that creates a `linear_timeseries` with a specified number of @@ -50,9 +50,13 @@ def create_multivariate_linear_timeseries( to `linear_timeseries` (`start_value`, `end_value`, `start`, `end`, `length`, etc). """ timeseries = [] + if components_names is None or len(components_names) < n_components: + components_names = [f"lin_ts_{i}" for i in range(n_components)] for i in range(n_components): # Values of each component is 1 larger than the last: - timeseries_i = linear_timeseries(column_name=f"lin_ts_{i}", **kwargs) + i + timeseries_i = ( + linear_timeseries(column_name=components_names[i], **kwargs) + i + ) timeseries.append(timeseries_i) return darts_concatenate(timeseries, axis=1) @@ -1805,25 +1809,49 @@ def test_create_lagged_components_names(self): on the lags, output_chunk_length and covariates. 
""" target_with_no_cov = self.create_multivariate_linear_timeseries( - n_components=1, start_value=0, end_value=10, start=2, length=10, freq=2 + n_components=1, + components_names=["no_static"], + start_value=0, + end_value=10, + start=2, + length=10, + freq=2, ) - target_with_cov = self.create_multivariate_linear_timeseries( - n_components=2, start_value=0, end_value=10, start=2, length=10, freq=2 + target_with_static_cov = self.create_multivariate_linear_timeseries( + n_components=2, + components_names=["static_0", "static_1"], + start_value=0, + end_value=10, + start=2, + length=10, + freq=2, ) - target_with_cov = target_with_cov.with_static_covariates( + target_with_static_cov = target_with_static_cov.with_static_covariates( pd.Series([1], index=["dummy_static_cov"]) ) past = self.create_multivariate_linear_timeseries( - n_components=3, start_value=10, end_value=20, start=2, length=10, freq=2 + n_components=3, + components_names=["past_0", "past_1", "past_2"], + start_value=10, + end_value=20, + start=2, + length=10, + freq=2, ) future = self.create_multivariate_linear_timeseries( - n_components=4, start_value=20, end_value=30, start=2, length=10, freq=2 + n_components=4, + components_names=["future_0", "future_1", "future_2", "future_3"], + start_value=20, + end_value=30, + start=2, + length=10, + freq=2, ) # target no static covariate - expected_lagged_features = ["lag_-2_lin_ts_0", "lag_-1_lin_ts_0"] + expected_lagged_features = ["lag_-2_no_static", "lag_-1_no_static"] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, past_covariates=None, @@ -1833,17 +1861,17 @@ def test_create_lagged_components_names(self): lags_future_covariates=None, concatenate=False, ) - self.assertEqual(expected_lagged_features, created_lagged_features[0]) + self.assertEqual(expected_lagged_features, created_lagged_features) # target with static covariate (not handled by this function) expected_lagged_features = [ - "lag_-4_lin_ts_0", - 
"lag_-4_lin_ts_1", - "lag_-1_lin_ts_0", - "lag_-1_lin_ts_1", + "lag_-4_static_0", + "lag_-4_static_1", + "lag_-1_static_0", + "lag_-1_static_1", ] created_lagged_features, _ = create_lagged_components_names( - target_series=target_with_cov, + target_series=target_with_static_cov, past_covariates=None, future_covariates=None, lags=[-4, -1], @@ -1851,15 +1879,15 @@ def test_create_lagged_components_names(self): lags_future_covariates=None, concatenate=False, ) - self.assertEqual(expected_lagged_features, created_lagged_features[0]) + self.assertEqual(expected_lagged_features, created_lagged_features) # target + past expected_lagged_features = [ - "lag_-4_lin_ts_0", - "lag_-3_lin_ts_0", - "lag_-1_lin_ts_0", - "lag_-1_lin_ts_1", - "lag_-1_lin_ts_2", + "lag_-4_no_static", + "lag_-3_no_static", + "lag_-1_past_0", + "lag_-1_past_1", + "lag_-1_past_2", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, @@ -1870,16 +1898,16 @@ def test_create_lagged_components_names(self): lags_future_covariates=None, concatenate=False, ) - self.assertEqual(expected_lagged_features, created_lagged_features[0]) + self.assertEqual(expected_lagged_features, created_lagged_features) # target + future expected_lagged_features = [ - "lag_-2_lin_ts_0", - "lag_-1_lin_ts_0", - "lag_3_lin_ts_0", - "lag_3_lin_ts_1", - "lag_3_lin_ts_2", - "lag_3_lin_ts_3", + "lag_-2_no_static", + "lag_-1_no_static", + "lag_3_future_0", + "lag_3_future_1", + "lag_3_future_2", + "lag_3_future_3", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, @@ -1890,17 +1918,17 @@ def test_create_lagged_components_names(self): lags_future_covariates=[3], concatenate=False, ) - self.assertEqual(expected_lagged_features, created_lagged_features[0]) + self.assertEqual(expected_lagged_features, created_lagged_features) # past + future expected_lagged_features = [ - "lag_-1_lin_ts_0", - "lag_-1_lin_ts_1", - "lag_-1_lin_ts_2", - "lag_2_lin_ts_0", 
- "lag_2_lin_ts_1", - "lag_2_lin_ts_2", - "lag_2_lin_ts_3", + "lag_-1_past_0", + "lag_-1_past_1", + "lag_-1_past_2", + "lag_2_future_0", + "lag_2_future_1", + "lag_2_future_2", + "lag_2_future_3", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, @@ -1911,24 +1939,24 @@ def test_create_lagged_components_names(self): lags_future_covariates=[2], concatenate=False, ) - self.assertEqual(expected_lagged_features, created_lagged_features[0]) + self.assertEqual(expected_lagged_features, created_lagged_features) # target with static + past + future expected_lagged_features = [ - "lag_-2_lin_ts_0", - "lag_-2_lin_ts_1", - "lag_-1_lin_ts_0", - "lag_-1_lin_ts_1", - "lag_-1_lin_ts_0", - "lag_-1_lin_ts_1", - "lag_-1_lin_ts_2", - "lag_2_lin_ts_0", - "lag_2_lin_ts_1", - "lag_2_lin_ts_2", - "lag_2_lin_ts_3", + "lag_-2_static_0", + "lag_-2_static_1", + "lag_-1_static_0", + "lag_-1_static_1", + "lag_-1_past_0", + "lag_-1_past_1", + "lag_-1_past_2", + "lag_2_future_0", + "lag_2_future_1", + "lag_2_future_2", + "lag_2_future_3", ] created_lagged_features, _ = create_lagged_components_names( - target_series=target_with_cov, + target_series=target_with_static_cov, past_covariates=past, future_covariates=future, lags=[-2, -1], @@ -1936,4 +1964,55 @@ def test_create_lagged_components_names(self): lags_future_covariates=[2], concatenate=False, ) - self.assertEqual(expected_lagged_features, created_lagged_features[0]) + self.assertEqual(expected_lagged_features, created_lagged_features) + + # multiple series with same components, including past/future covariates + expected_lagged_features = [ + "lag_-3_static_0", + "lag_-3_static_1", + "lag_-1_past_0", + "lag_-1_past_1", + "lag_-1_past_2", + "lag_2_future_0", + "lag_2_future_1", + "lag_2_future_2", + "lag_2_future_3", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=[target_with_static_cov, target_with_static_cov], + past_covariates=[past, past], + 
future_covariates=[future, future], + lags=[-3], + lags_past_covariates=[-1], + lags_future_covariates=[2], + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features) + + # multiple series with different components, same past, different future + expected_lagged_features = [ + "lag_-2_target_0", + "lag_-2_target_1", + "lag_-1_target_0", + "lag_-1_target_1", + "lag_-1_past_0", + "lag_-1_past_1", + "lag_-1_past_2", + "lag_2_future_cov_0", + "lag_2_future_cov_1", + "lag_2_future_cov_2", + "lag_2_future_cov_3", + ] + created_lagged_features, _ = create_lagged_components_names( + target_series=[ + target_with_static_cov, + target_with_no_cov.stack(target_with_no_cov), + ], + past_covariates=[past, past], + future_covariates=[future, past.stack(target_with_no_cov)], + lags=[-2, -1], + lags_past_covariates=[-1], + lags_future_covariates=[2], + concatenate=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features) diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index af099ba31f..9c9d01f473 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -1,6 +1,6 @@ import warnings from functools import reduce -from itertools import zip_longest +from itertools import chain from math import inf from typing import List, Optional, Sequence, Tuple, Union @@ -558,15 +558,18 @@ def create_lagged_components_names( `target_series.n_components=2`): lag_+0_comp_1_target | lag_+0_comp_2_target | ... | lag_+3_comp_1_target | lag_+3_comp_2_target + Note : if `target_series`, `past_covariates` or `future_covariates` contain series with different + components name, generic feature names will be created (independently for each variate). + Returns ------- features_cols_name - The names of the lagged features in the `X` array generated by `create_lagged_data()`. 
- `features_cols_name` always returned as a `List[List[str]]` and if `concatenate=True`, - each entry also contains the columns names for the `y` array (on the right). + The names of the lagged features in the `X` array generated by `create_lagged_data()` + as a `List[str]`. If `concatenate=True`, also contains the columns names for + the `y` array (on the right). labels_cols_name - The names of the lagged features in the `y` array generated by `create_lagged_data()`. - `labels_cols_name` always returned as a `List[List[str]]` + The names of the lagged features in the `y` array generated by `create_lagged_data()` + as a `List[str]`. See Also -------- @@ -586,38 +589,50 @@ def create_lagged_components_names( else future_covariates ) - features_cols_name = [] - labels_cols_name = [] - for target_ts, past_cov_ts, future_cov_ts in zip_longest( - target_series, past_covariates, future_covariates, fillvalue=None + covariates_specs = [] + for variate, variate_lags, prefix in zip( + [target_series, past_covariates, future_covariates], + [lags, lags_past_covariates, lags_future_covariates], + ["target", "past_cov", "future_cov"], ): - covariates_specs = [] - if target_ts and lags: - covariates_specs.append((target_ts.components, lags)) - if past_cov_ts and lags_past_covariates: - covariates_specs.append((past_cov_ts.components, lags_past_covariates)) - if future_cov_ts and lags_future_covariates: - covariates_specs.append((future_cov_ts.components, lags_future_covariates)) - - features_cols_name.append( - [ - f"lag_{lag_idx}_{comp_name}" - for variate_components, variates_lags in covariates_specs - for lag_idx in variates_lags - for comp_name in variate_components - ] + unique_components_names = set( + chain.from_iterable( + [list(ts.components) for ts in variate if ts is not None] + ) ) + if len(unique_components_names) == 0: + pass + elif variate_lags: + # using first ts components names + if len(unique_components_names) == variate[0].n_components: + 
covariates_specs.append((variate[0].components, variate_lags)) + # create generic feature names + else: + covariates_specs.append( + ( + [f"{prefix}_{i}" for i in range(variate[0].n_components)], + variate_lags, + ) + ) - labels_cols_name.append( - [ - f"lag_{lag_idx}_{comp_name}" - for lag_idx in range(0, output_chunk_length) - for comp_name in target_ts.components - ] - ) + features_cols_name = [ + f"lag_{lag_idx}_{comp_name}" + for variate_components, variates_lags in covariates_specs + for lag_idx in variates_lags + for comp_name in variate_components + ] - if concatenate: - features_cols_name[-1] += labels_cols_name[-1] + if target_series[0] and lags: + labels_cols_name = [ + f"lag_{lag_idx}_{comp_name}" + for lag_idx in range(0, output_chunk_length) + for comp_name in covariates_specs[0][0] + ] + else: + labels_cols_name = [] + + if concatenate: + features_cols_name += labels_cols_name return features_cols_name, labels_cols_name From f48ea8a1d50c14a45e466a217f96fc927194955c Mon Sep 17 00:00:00 2001 From: Antoine Madrona Date: Thu, 6 Apr 2023 11:47:53 +0200 Subject: [PATCH 06/12] fix: using the same naming convention for the lagged components as the explainability module --- .../forecasting/test_regression_models.py | 4 +- .../test_create_lagged_training_data.py | 108 +++++++++--------- darts/utils/data/tabularization.py | 25 ++-- 3 files changed, 74 insertions(+), 63 deletions(-) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 0a8503aae4..f020da9dcd 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1159,7 +1159,7 @@ def test_static_cov_accuracy(self): # multiple series with different components names ("smooth" and "irregular"), # triggers creation of generic feature names expected_features_in = [ - f"lag_{str(-i)}_target_0" for i in range(period // 2, 0, -1) + f"comp0_target_lag{str(-i)}" 
for i in range(period // 2, 0, -1) ] self.assertEqual( @@ -1179,7 +1179,7 @@ def test_static_cov_accuracy(self): # multiple univariates series with different names with same static cov expected_features_in = [ - f"lag_{str(-i)}_target_0" for i in range(period // 2, 0, -1) + f"comp0_target_lag{str(-i)}" for i in range(period // 2, 0, -1) ] + ["curve_type"] self.assertEqual( diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index e4a2c95000..3db2be4338 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -1851,7 +1851,7 @@ def test_create_lagged_components_names(self): ) # target no static covariate - expected_lagged_features = ["lag_-2_no_static", "lag_-1_no_static"] + expected_lagged_features = ["no_static_lag-2", "no_static_lag-1"] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, past_covariates=None, @@ -1865,10 +1865,10 @@ def test_create_lagged_components_names(self): # target with static covariate (not handled by this function) expected_lagged_features = [ - "lag_-4_static_0", - "lag_-4_static_1", - "lag_-1_static_0", - "lag_-1_static_1", + "static_0_lag-4", + "static_1_lag-4", + "static_0_lag-1", + "static_1_lag-1", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_static_cov, @@ -1883,11 +1883,11 @@ def test_create_lagged_components_names(self): # target + past expected_lagged_features = [ - "lag_-4_no_static", - "lag_-3_no_static", - "lag_-1_past_0", - "lag_-1_past_1", - "lag_-1_past_2", + "no_static_lag-4", + "no_static_lag-3", + "past_0_lag-1", + "past_1_lag-1", + "past_2_lag-1", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, @@ -1902,12 +1902,12 @@ def test_create_lagged_components_names(self): # target + future 
expected_lagged_features = [ - "lag_-2_no_static", - "lag_-1_no_static", - "lag_3_future_0", - "lag_3_future_1", - "lag_3_future_2", - "lag_3_future_3", + "no_static_lag-2", + "no_static_lag-1", + "future_0_lag3", + "future_1_lag3", + "future_2_lag3", + "future_3_lag3", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, @@ -1922,13 +1922,13 @@ def test_create_lagged_components_names(self): # past + future expected_lagged_features = [ - "lag_-1_past_0", - "lag_-1_past_1", - "lag_-1_past_2", - "lag_2_future_0", - "lag_2_future_1", - "lag_2_future_2", - "lag_2_future_3", + "past_0_lag-1", + "past_1_lag-1", + "past_2_lag-1", + "future_0_lag2", + "future_1_lag2", + "future_2_lag2", + "future_3_lag2", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_no_cov, @@ -1943,17 +1943,17 @@ def test_create_lagged_components_names(self): # target with static + past + future expected_lagged_features = [ - "lag_-2_static_0", - "lag_-2_static_1", - "lag_-1_static_0", - "lag_-1_static_1", - "lag_-1_past_0", - "lag_-1_past_1", - "lag_-1_past_2", - "lag_2_future_0", - "lag_2_future_1", - "lag_2_future_2", - "lag_2_future_3", + "static_0_lag-2", + "static_1_lag-2", + "static_0_lag-1", + "static_1_lag-1", + "past_0_lag-1", + "past_1_lag-1", + "past_2_lag-1", + "future_0_lag2", + "future_1_lag2", + "future_2_lag2", + "future_3_lag2", ] created_lagged_features, _ = create_lagged_components_names( target_series=target_with_static_cov, @@ -1968,15 +1968,15 @@ def test_create_lagged_components_names(self): # multiple series with same components, including past/future covariates expected_lagged_features = [ - "lag_-3_static_0", - "lag_-3_static_1", - "lag_-1_past_0", - "lag_-1_past_1", - "lag_-1_past_2", - "lag_2_future_0", - "lag_2_future_1", - "lag_2_future_2", - "lag_2_future_3", + "static_0_lag-3", + "static_1_lag-3", + "past_0_lag-1", + "past_1_lag-1", + "past_2_lag-1", + "future_0_lag2", + 
"future_1_lag2", + "future_2_lag2", + "future_3_lag2", ] created_lagged_features, _ = create_lagged_components_names( target_series=[target_with_static_cov, target_with_static_cov], @@ -1991,17 +1991,17 @@ def test_create_lagged_components_names(self): # multiple series with different components, same past, different future expected_lagged_features = [ - "lag_-2_target_0", - "lag_-2_target_1", - "lag_-1_target_0", - "lag_-1_target_1", - "lag_-1_past_0", - "lag_-1_past_1", - "lag_-1_past_2", - "lag_2_future_cov_0", - "lag_2_future_cov_1", - "lag_2_future_cov_2", - "lag_2_future_cov_3", + "comp0_target_lag-2", + "comp1_target_lag-2", + "comp0_target_lag-1", + "comp1_target_lag-1", + "past_0_lag-1", + "past_1_lag-1", + "past_2_lag-1", + "comp0_future_cov_lag2", + "comp1_future_cov_lag2", + "comp2_future_cov_lag2", + "comp3_future_cov_lag2", ] created_lagged_features, _ = create_lagged_components_names( target_series=[ diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 9c9d01f473..6ddf51a67f 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -546,21 +546,28 @@ def create_lagged_components_names( ) -> Tuple[List[List[str]], List[List[str]]]: """ Helper function called to retrieve the name of the features and labels arrays created with - `create_lagged_data()`. The convention are the following: + `create_lagged_data()`. 
The order of the features is the following: Along the `n_lagged_features` axis, `X` has the following structure (for `*_lags=[-2,-1]` and `*_series.n_components = 2`): lagged_target | lagged_past_covariates | lagged_future_covariates where each `lagged_*` has the following structure: - lag_-2_comp_1_* | lag_-2_comp_2_* | lag_-1_comp_1_* | lag_-1_comp_2_* + comp0_*_lag-2 | comp1_*_lag-2 | comp0_*_lag_-1 | comp1_*_lag-1 Along the `n_lagged_labels` axis, `y` has the following structure (for `output_chunk_length=4` and `target_series.n_components=2`): - lag_+0_comp_1_target | lag_+0_comp_2_target | ... | lag_+3_comp_1_target | lag_+3_comp_2_target + comp0_target_lag0 | comp1_target_lag0 | ... | comp0_target_lag3 | comp1_target_lag3 Note : if `target_series`, `past_covariates` or `future_covariates` contain series with different components name, generic feature names will be created (independently for each variate). + The component name convention is ``"{name}_lag{idx}"``, where: + + - ``{name}`` is the original component name if it shared across all the TimeSeries, or generic + names ``"comp{i}_{cov_type}`` where {i} is the index of the component and {cov_type} one of + ``"target"``, ``"past_cov"`` or ``"future_cov"``. + - ``{idx}`` is the lag index. 
+ Returns ------- features_cols_name @@ -590,7 +597,7 @@ def create_lagged_components_names( ) covariates_specs = [] - for variate, variate_lags, prefix in zip( + for variate, variate_lags, variate_type in zip( [target_series, past_covariates, future_covariates], [lags, lags_past_covariates, lags_future_covariates], ["target", "past_cov", "future_cov"], @@ -600,6 +607,7 @@ def create_lagged_components_names( [list(ts.components) for ts in variate if ts is not None] ) ) + # variate is None if len(unique_components_names) == 0: pass elif variate_lags: @@ -610,13 +618,16 @@ def create_lagged_components_names( else: covariates_specs.append( ( - [f"{prefix}_{i}" for i in range(variate[0].n_components)], + [ + f"comp{i}_{variate_type}" + for i in range(variate[0].n_components) + ], variate_lags, ) ) - + # using the same convention as the explainability module features_cols_name = [ - f"lag_{lag_idx}_{comp_name}" + f"{comp_name}_lag{lag_idx}" for variate_components, variates_lags in covariates_specs for lag_idx in variates_lags for comp_name in variate_components From 9fa93cbdde2a2c0229cc92901a9233e169c64c5c Mon Sep 17 00:00:00 2001 From: dennisbader Date: Thu, 6 Apr 2023 18:37:34 +0200 Subject: [PATCH 07/12] refactor and fix some type hint warnings --- darts/models/forecasting/regression_model.py | 25 ++-- .../test_create_lagged_training_data.py | 22 ++-- darts/utils/data/tabularization.py | 115 ++++++++---------- 3 files changed, 73 insertions(+), 89 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 8174a3c1d3..26c74c3163 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -36,7 +36,7 @@ from darts.models.forecasting.forecasting_model import GlobalForecastingModel from darts.timeseries import TimeSeries from darts.utils.data.tabularization import ( - create_lagged_components_names, + create_lagged_component_names, 
create_lagged_training_data, ) from darts.utils.multioutput import MultiOutputRegressor @@ -361,26 +361,23 @@ def _create_lagged_data( return training_samples, training_labels - def _create_lagged_components_name( + def _create_lagged_component_names( self, target_series, past_covariates, future_covariates ): - lags = self.lags.get("target") - lags_past_covariates = self.lags.get("past") - lags_future_covariates = self.lags.get("future") - features_cols_name, labels_cols_name = create_lagged_components_names( + features_cols_name, labels_cols_name = create_lagged_component_names( target_series=target_series, past_covariates=past_covariates, future_covariates=future_covariates, - lags=lags, - lags_past_covariates=lags_past_covariates, - lags_future_covariates=lags_future_covariates, + lags=self.lags.get("target"), + lags_past_covariates=self.lags.get("past"), + lags_future_covariates=self.lags.get("future"), output_chunk_length=self.output_chunk_length, concatenate=False, ) # adding the static covariates on the right of each features_cols_name - features_cols_name = self._add_static_covariates_name( + features_cols_name = self._add_static_covariate_names( features_cols_name, target_series, ) @@ -474,7 +471,7 @@ def _add_static_covariates( features = features[0] return features - def _add_static_covariates_name( + def _add_static_covariate_names( self, features_cols_name: List[List[str]], target_series: Union[TimeSeries, Sequence[TimeSeries]], @@ -488,7 +485,7 @@ def _add_static_covariates_name( ---------- features_cols_name The name of the features of the numpy array(s) to which the static covariates will be added, generated with - `create_lagged_components_names()` + `create_lagged_component_names()` target_series The target series from which to read the static covariates. 
@@ -535,12 +532,12 @@ def _fit_model( self.model.fit(training_samples, training_labels, **kwargs) # generate and store the lagged components names (for feature importance analysis) - lagged_features_names, _ = self._create_lagged_components_name( + lagged_feature_names, _ = self._create_lagged_component_names( target_series=target_series, past_covariates=past_covariates, future_covariates=future_covariates, ) - self.model.lagged_features_name_ = lagged_features_names + self.model.lagged_features_name_ = lagged_feature_names def fit( self, diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index 3db2be4338..0767075a97 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -10,7 +10,7 @@ from darts.logging import get_logger, raise_if, raise_if_not, raise_log from darts.tests.base_test_class import DartsBaseTestClass from darts.utils.data.tabularization import ( - create_lagged_components_names, + create_lagged_component_names, create_lagged_training_data, ) from darts.utils.timeseries_generation import linear_timeseries @@ -1803,9 +1803,9 @@ def test_lagged_training_data_unspecified_lag_or_series_warning(self): ) self.assertEqual(len(w), 0) - def test_create_lagged_components_names(self): + def test_create_lagged_component_names(self): """ - Tests that `create_lagged_components_names` produces the expected features name depending + Tests that `create_lagged_component_names` produces the expected features name depending on the lags, output_chunk_length and covariates. 
""" target_with_no_cov = self.create_multivariate_linear_timeseries( @@ -1852,7 +1852,7 @@ def test_create_lagged_components_names(self): # target no static covariate expected_lagged_features = ["no_static_lag-2", "no_static_lag-1"] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, past_covariates=None, future_covariates=None, @@ -1870,7 +1870,7 @@ def test_create_lagged_components_names(self): "static_0_lag-1", "static_1_lag-1", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=target_with_static_cov, past_covariates=None, future_covariates=None, @@ -1889,7 +1889,7 @@ def test_create_lagged_components_names(self): "past_1_lag-1", "past_2_lag-1", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, past_covariates=past, future_covariates=None, @@ -1909,7 +1909,7 @@ def test_create_lagged_components_names(self): "future_2_lag3", "future_3_lag3", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, past_covariates=None, future_covariates=future, @@ -1930,7 +1930,7 @@ def test_create_lagged_components_names(self): "future_2_lag2", "future_3_lag2", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, past_covariates=past, future_covariates=future, @@ -1955,7 +1955,7 @@ def test_create_lagged_components_names(self): "future_2_lag2", "future_3_lag2", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=target_with_static_cov, past_covariates=past, 
future_covariates=future, @@ -1978,7 +1978,7 @@ def test_create_lagged_components_names(self): "future_2_lag2", "future_3_lag2", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=[target_with_static_cov, target_with_static_cov], past_covariates=[past, past], future_covariates=[future, future], @@ -2003,7 +2003,7 @@ def test_create_lagged_components_names(self): "comp2_future_cov_lag2", "comp3_future_cov_lag2", ] - created_lagged_features, _ = create_lagged_components_names( + created_lagged_features, _ = create_lagged_component_names( target_series=[ target_with_static_cov, target_with_no_cov.stack(target_with_no_cov), diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 6ddf51a67f..f3d3bc05af 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -1,6 +1,5 @@ import warnings from functools import reduce -from itertools import chain from math import inf from typing import List, Optional, Sequence, Tuple, Union @@ -232,7 +231,7 @@ def create_lagged_data( See Also -------- - tabularization.create_lagged_components_names : return the lagged features names as a list of strings. + tabularization.create_lagged_component_names : return the lagged features names as a list of strings. """ raise_if( @@ -437,7 +436,7 @@ def create_lagged_prediction_data( check_inputs: bool = True, use_moving_windows: bool = True, concatenate: bool = True, -) -> Tuple[ArrayOrArraySequence, Union[None, ArrayOrArraySequence], Sequence[pd.Index]]: +) -> Tuple[ArrayOrArraySequence, Sequence[pd.Index]]: """ Creates the features array `X` to produce a series of prediction from an already-trained regression model; the time index values of each observation is also returned. 
@@ -534,7 +533,7 @@ def create_lagged_prediction_data( return X, times -def create_lagged_components_names( +def create_lagged_component_names( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, @@ -582,70 +581,51 @@ def create_lagged_components_names( -------- tabularization.create_lagged_data : generate the lagged features and labels as (list of) Arrays. """ - target_series = ( - [target_series] if not isinstance(target_series, Sequence) else target_series - ) - past_covariates = ( - [past_covariates] - if not isinstance(past_covariates, Sequence) - else past_covariates - ) - future_covariates = ( - [future_covariates] - if not isinstance(future_covariates, Sequence) - else future_covariates - ) + target_series = series2seq(target_series) + past_covariates = series2seq(past_covariates) + future_covariates = series2seq(future_covariates) - covariates_specs = [] + lagged_feature_names = [] + label_feature_names = [] for variate, variate_lags, variate_type in zip( [target_series, past_covariates, future_covariates], [lags, lags_past_covariates, lags_future_covariates], ["target", "past_cov", "future_cov"], ): - unique_components_names = set( - chain.from_iterable( - [list(ts.components) for ts in variate if ts is not None] - ) - ) - # variate is None - if len(unique_components_names) == 0: - pass - elif variate_lags: - # using first ts components names - if len(unique_components_names) == variate[0].n_components: - covariates_specs.append((variate[0].components, variate_lags)) - # create generic feature names - else: - covariates_specs.append( - ( - [ - f"comp{i}_{variate_type}" - for i in range(variate[0].n_components) - ], - variate_lags, - ) - ) - # using the same convention as the explainability module - features_cols_name = [ - f"{comp_name}_lag{lag_idx}" - for variate_components, 
variates_lags in covariates_specs - for lag_idx in variates_lags - for comp_name in variate_components - ] + if variate is None or variate_lags is None: + continue + + use_specific_component_names = True + components = variate[0].components + # terminate early if we find any variates with different component names + for ts in variate: + if not components.equals(ts.components): + use_specific_component_names = False + break + + if use_specific_component_names: + names = components.tolist() + else: + names = [f"comp{i}_{variate_type}" for i in range(len(components))] - if target_series[0] and lags: - labels_cols_name = [ - f"lag_{lag_idx}_{comp_name}" - for lag_idx in range(0, output_chunk_length) - for comp_name in covariates_specs[0][0] + # using the same convention as the explainability module + lagged_feature_names += [ + f"{comp_name}_lag{lag_idx}" + for lag_idx in variate_lags + for comp_name in names ] - else: - labels_cols_name = [] + + if variate_type == "target" and lags: + label_feature_names = [ + f"lag_{lag_idx}_{comp_name}" + for lag_idx in range(output_chunk_length) + for comp_name in names + ] if concatenate: - features_cols_name += labels_cols_name + lagged_feature_names += label_feature_names - return features_cols_name, labels_cols_name + return lagged_feature_names, label_feature_names def _create_lagged_data_by_moving_window( @@ -713,6 +693,8 @@ def _create_lagged_data_by_moving_window( start_time = times[0] # Construct features array X: X = [] + start_time_idx = None + target_start_time_idx = None for i, (series_i, lags_i, min_lag_i, max_lag_i) in enumerate( zip( [target_series, past_covariates, future_covariates], @@ -823,7 +805,7 @@ def _extract_lagged_vals_from_windows( # windows.shape = (num_windows, window_len, num_components, num_samples): windows = np.moveaxis(windows, (0, 3, 1, 2), (0, 1, 2, 3)) # lagged_vals.shape = (num_windows, num_components*window_len, num_samples): - lagged_vals = windows.reshape(windows.shape[0], -1, 
windows.shape[-1]) + lagged_vals = windows.reshape((windows.shape[0], -1, windows.shape[-1])) return lagged_vals @@ -875,6 +857,8 @@ def _create_lagged_data_by_intersecting_times( if len(shared_times) > max_samples_per_ts: shared_times = shared_times[-max_samples_per_ts:] X = [] + shared_time_idx = None + label_shared_time_idx = None for i, (series_i, lags_i, min_lag_i) in enumerate( zip( [target_series, past_covariates, future_covariates], @@ -940,10 +924,12 @@ def _create_lagged_data_by_intersecting_times( # For convenience, define following types for `_get_feature_times`: FeatureTimes = Tuple[ - Union[pd.Index, None], Union[pd.Index, None], Union[pd.Index, None] + Optional[Union[pd.Index, pd.DatetimeIndex, pd.RangeIndex]], + Optional[Union[pd.Index, pd.DatetimeIndex, pd.RangeIndex]], + Optional[Union[pd.Index, pd.DatetimeIndex, pd.RangeIndex]], ] -MinLags = Tuple[Union[int, None], Union[int, None], Union[int, None]] -MaxLags = Tuple[Union[int, None], Union[int, None], Union[int, None]] +MinLags = Tuple[Optional[int], Optional[int], Optional[int]] +MaxLags = Tuple[Optional[int], Optional[int], Optional[int]] def _get_feature_times( @@ -1178,7 +1164,7 @@ def _get_feature_times( def get_shared_times( - *series_or_times: Sequence[Union[TimeSeries, pd.Index, None]], sort: bool = True + *series_or_times: Union[TimeSeries, pd.Index, None], sort: bool = True ) -> pd.Index: """ Returns the times shared by all of the specified `TimeSeries` or time indexes (i.e. the intersection of all @@ -1440,7 +1426,7 @@ def _extend_time_index( return time_index -def _get_freqs(*series: Sequence[Union[TimeSeries, None]]): +def _get_freqs(*series: Union[TimeSeries, None]): """ Returns list with the frequency of all of the specified (i.e. non-`None`) `series`. 
""" @@ -1451,7 +1437,7 @@ def _get_freqs(*series: Sequence[Union[TimeSeries, None]]): return freqs -def _all_equal_freq(*series: Sequence[Union[TimeSeries, None]]) -> bool: +def _all_equal_freq(*series: Union[TimeSeries, None]) -> bool: """ Returns `True` is all of the specified (i.e. non-`None`) `series` have the same frequency. """ @@ -1499,6 +1485,7 @@ def _check_series_length( is_target = name == "target_series" is_label_series = is_training and is_target lags_specified = lags is not None + minimum_len, minimum_len_str = None, None if is_label_series: minimum_len_str = ( "-min(lags) + output_chunk_length" From 38c10a04e5cf44c1e0999b7e93d7d407b413c0fa Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 11 Apr 2023 21:36:33 +0200 Subject: [PATCH 08/12] simplified lagged feature name generation and moved out of regression model --- darts/models/forecasting/regression_model.py | 86 ++++++------------- .../test_add_static_covariates.py | 8 +- darts/utils/data/tabularization.py | 66 +++++++------- 3 files changed, 69 insertions(+), 91 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 65102e8f27..582b5c77e3 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -137,6 +137,7 @@ def __init__( self.multi_models = multi_models self._considers_static_covariates = use_static_covariates self._static_covariates_shape: Optional[Tuple[int, int]] = None + self._lagged_feature_names: Optional[List[str]] = None # model checks if self.model is None: @@ -383,64 +384,6 @@ def _create_lagged_data( return training_samples, training_labels - def _create_lagged_component_names( - self, target_series, past_covariates, future_covariates - ): - - features_cols_name, labels_cols_name = create_lagged_component_names( - target_series=target_series, - past_covariates=past_covariates, - future_covariates=future_covariates, - lags=self.lags.get("target"), - 
lags_past_covariates=self.lags.get("past"), - lags_future_covariates=self.lags.get("future"), - output_chunk_length=self.output_chunk_length, - concatenate=False, - ) - - # adding the static covariates on the right of each features_cols_name - features_cols_name = self._add_static_covariate_names( - features_cols_name, - target_series, - ) - - return features_cols_name, labels_cols_name - - def _add_static_covariate_names( - self, - features_cols_name: List[List[str]], - target_series: Union[TimeSeries, Sequence[TimeSeries]], - ) -> Union[np.array, Sequence[np.array]]: - """ - Add static covariates names to the features name for RegressionModels. - Accounts for series with potentially different static covariates to accomodate for the maximum - number of available static_covariates in any of the given series in the sequence. - - Parameters - ---------- - features_cols_name - The name of the features of the numpy array(s) to which the static covariates will be added, generated with - `create_lagged_component_names()` - target_series - The target series from which to read the static covariates. - - Returns - ------- - features_cols_name - The features' name list with appended static covariates names on the right. 
- """ - target_series = series2seq(target_series) - - # collect static covariates info, preserve the order - static_covs_names = [] - for ts in target_series: - if ts.has_static_covariates: - for static_cov_name in ts.static_covariates.keys(): - if static_cov_name not in static_covs_names: - static_covs_names.append(static_cov_name) - - return features_cols_name + static_covs_names - def _fit_model( self, target_series, @@ -467,12 +410,17 @@ def _fit_model( self.model.fit(training_samples, training_labels, **kwargs) # generate and store the lagged components names (for feature importance analysis) - lagged_feature_names, _ = self._create_lagged_component_names( + self._lagged_feature_names, _ = create_lagged_component_names( target_series=target_series, past_covariates=past_covariates, future_covariates=future_covariates, + lags=self.lags.get("target"), + lags_past_covariates=self.lags.get("past"), + lags_future_covariates=self.lags.get("future"), + output_chunk_length=self.output_chunk_length, + concatenate=False, + use_static_covariates=self.uses_static_covariates, ) - self.model.lagged_features_name_ = lagged_feature_names def fit( self, @@ -833,6 +781,24 @@ def _predict_and_sample( return prediction.reshape(k, self.pred_dim, -1) + @property + def lagged_feature_names(self) -> Optional[List[str]]: + """The lagged feature names the model has been trained on. + + The naming convention for target, past and future covariates is: ``"{name}_{type}_lag{i}"``, where: + + - ``{name}`` the component name of the (first) series + - ``{type}`` is the feature type, one of "target", "pastcov", and "futcov" + - ``{i}`` is the lag value + + The naming convention for static covariates is: ``"{name}_statcov_target_{comp}"``, where: + + - ``{name}`` the static covariate name of the (first) series + - ``{comp}`` the target component name of the (first) that the static covariate act on. If the static + covariate acts globally on a multivariate target series, will show "global". 
+ """ + return self._lagged_feature_names + def __str__(self): return self.model.__str__() diff --git a/darts/tests/utils/tabularization/test_add_static_covariates.py b/darts/tests/utils/tabularization/test_add_static_covariates.py index 883a2e2a4c..ec0012b55d 100644 --- a/darts/tests/utils/tabularization/test_add_static_covariates.py +++ b/darts/tests/utils/tabularization/test_add_static_covariates.py @@ -17,7 +17,7 @@ class AddStaticToLaggedDataTestCase(DartsBaseTestClass): pd.DataFrame({"a": [0.0], "b": [1.0]}) ) series_stcov_multivar = series.with_static_covariates( - pd.DataFrame({"a": [0.0, 1.0], "b": [1.0, 2.0]}) + pd.DataFrame({"a": [0.0, 1.0], "b": [10.0, 20.0]}) ) features = np.empty(shape=(len(series), 2)) @@ -104,6 +104,9 @@ def test_add_static_covs_train(self): ) assert [features_.shape == expected_shape for features_ in features] assert last_shape == self.series_stcov_multivar.static_covariates.shape + assert np.all( + features[0][:, -sum(last_shape) :] == np.array([0.0, 1.0, 10.0, 20.0]) + ) def test_add_static_covs_predict(self): # predicting when `last_shape` other than `None` @@ -179,3 +182,6 @@ def test_add_static_covs_predict(self): ) assert [features_.shape == expected_shape for features_ in features] assert last_shape == self.series_stcov_multivar.static_covariates.shape + assert np.all( + features[0][:, -sum(last_shape) :] == np.array([0.0, 1.0, 10.0, 20.0]) + ) diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 901fd32c20..5f3d336149 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -14,7 +14,7 @@ from darts.logging import get_logger, raise_if, raise_if_not, raise_log from darts.timeseries import TimeSeries -from darts.utils.utils import series2seq +from darts.utils.utils import get_single_series, series2seq logger = get_logger(__name__) @@ -630,6 +630,7 @@ def create_lagged_component_names( lags_future_covariates: Optional[Sequence[int]] = None, 
output_chunk_length: int = 1, concatenate: bool = True, + use_static_covariates: bool = False, ) -> Tuple[List[List[str]], List[List[str]]]: """ Helper function called to retrieve the name of the features and labels arrays created with @@ -637,23 +638,30 @@ def create_lagged_component_names( Along the `n_lagged_features` axis, `X` has the following structure (for `*_lags=[-2,-1]` and `*_series.n_components = 2`): - lagged_target | lagged_past_covariates | lagged_future_covariates + lagged_target | lagged_past_covariates | lagged_future_covariates | static covariates where each `lagged_*` has the following structure: comp0_*_lag-2 | comp1_*_lag-2 | comp0_*_lag_-1 | comp1_*_lag-1 + and for static covariates (2 static covariates acting on 2 target components): + cov0_*_target_comp0 | cov0_*_target_comp1 | cov1_*_target_comp0 | cov1_*_target_comp1 Along the `n_lagged_labels` axis, `y` has the following structure (for `output_chunk_length=4` and `target_series.n_components=2`): comp0_target_lag0 | comp1_target_lag0 | ... | comp0_target_lag3 | comp1_target_lag3 - Note : if `target_series`, `past_covariates` or `future_covariates` contain series with different - components name, generic feature names will be created (independently for each variate). + Note : will only use the component names of the first series from `target_series`, `past_covariates`, + `future_covariates`, and static_covariates. 
+ + The naming convention for target, past and future covariates is: ``"{name}_{type}_lag{i}"``, where: + + - ``{name}`` the component name of the (first) series + - ``{type}`` is the feature type, one of "target", "pastcov", and "futcov" + - ``{i}`` is the lag value - The component name convention is ``"{name}_lag{idx}"``, where: + The naming convention for static covariates is: ``"{name}_statcov_target_{comp}"``, where: - - ``{name}`` is the original component name if it shared across all the TimeSeries, or generic - names ``"comp{i}_{cov_type}`` where {i} is the index of the component and {cov_type} one of - ``"target"``, ``"past_cov"`` or ``"future_cov"``. - - ``{idx}`` is the lag index. + - ``{name}`` the static covariate name of the (first) series + - ``{comp}`` the target component name of the (first) series that the static covariate acts on. If the static + covariate acts globally on a multivariate target series, will show "global_components". Returns ------- @@ -678,38 +686,36 @@ def create_lagged_component_names( for variate, variate_lags, variate_type in zip( [target_series, past_covariates, future_covariates], [lags, lags_past_covariates, lags_future_covariates], - ["target", "past_cov", "future_cov"], + ["target", "pastcov", "futcov"], ): if variate is None or variate_lags is None: continue - use_specific_component_names = True - components = variate[0].components - # terminate early if we find any variates with different component names - for ts in variate: - if not components.equals(ts.components): - use_specific_component_names = False - break - - if use_specific_component_names: - names = components.tolist() - else: - names = [f"comp{i}_{variate_type}" for i in range(len(components))] - - # using the same convention as the explainability module + components = get_single_series(variate).components.tolist() lagged_feature_names += [ - f"{comp_name}_lag{lag_idx}" - for lag_idx in variate_lags - for comp_name in names + f"{name}_{variate_type}_lag{lag}" + for lag in 
variate_lags + for name in components ] if variate_type == "target" and lags: label_feature_names = [ - f"lag_{lag_idx}_{comp_name}" - for lag_idx in range(output_chunk_length) - for comp_name in names + f"{name}_target_lag{lag}" + for lag in range(output_chunk_length) + for name in components ] + # static covariates + if use_static_covariates: + static_covs = get_single_series(target_series).static_covariates + # static covariate names + names = static_covs.columns.tolist() + # target components that the static covariates reference to + comps = static_covs.index.tolist() + lagged_feature_names += [ + f"{name}_statcov_target_{comp}" for name in names for comp in comps + ] + if concatenate: lagged_feature_names += label_feature_names From 1325983064118cc26577143c11b5b0cfd8333269 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 11 Apr 2023 21:56:59 +0200 Subject: [PATCH 09/12] fix regr model tests --- .../models/forecasting/test_regression_models.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 57aa2b0e55..8857864287 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -834,7 +834,6 @@ def test_static_cov_accuracy(self): model_static_cov = RandomForest(lags=period // 2, bootstrap=False) model_static_cov.fit(fitting_series) pred_static_cov = model_static_cov.predict(n=period, series=fitting_series) - # then for series, ps_no_st, ps_st_cat in zip( train_series_static_cov, pred_no_static_cov, pred_static_cov @@ -856,14 +855,11 @@ def test_static_cov_accuracy(self): n=period, series=fitting_series ) # multiple series with different components names ("smooth" and "irregular"), - # triggers creation of generic feature names + # will take first target name expected_features_in = [ - f"comp0_target_lag{str(-i)}" for i in 
range(period // 2, 0, -1) + f"smooth_target_lag{str(-i)}" for i in range(period // 2, 0, -1) ] - - self.assertEqual( - model_no_static_cov.model.lagged_features_name_, expected_features_in - ) + self.assertEqual(model_no_static_cov.lagged_feature_names, expected_features_in) self.assertEqual( len(model_no_static_cov.model.feature_importances_), len(expected_features_in), @@ -879,11 +875,9 @@ def test_static_cov_accuracy(self): # multiple univariates series with different names with same static cov expected_features_in = [ f"comp0_target_lag{str(-i)}" for i in range(period // 2, 0, -1) - ] + ["curve_type"] + ] + ["curve_type_statcov_target_smooth"] - self.assertEqual( - model_static_cov.model.lagged_features_name_, expected_features_in - ) + self.assertEqual(model_static_cov.lagged_feature_names, expected_features_in) self.assertEqual( len(model_static_cov.model.feature_importances_), len(expected_features_in), From dd3798ba9f90fd4e6264aa75fae17997c26fd21e Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 11 Apr 2023 22:17:57 +0200 Subject: [PATCH 10/12] fix create lagged data tests --- .../test_create_lagged_training_data.py | 197 ++++++++++++------ 1 file changed, 138 insertions(+), 59 deletions(-) diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index 0767075a97..edfffc5052 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -1817,9 +1817,9 @@ def test_create_lagged_component_names(self): length=10, freq=2, ) - + n_comp = 2 target_with_static_cov = self.create_multivariate_linear_timeseries( - n_components=2, + n_components=n_comp, components_names=["static_0", "static_1"], start_value=0, end_value=10, @@ -1828,7 +1828,20 @@ def test_create_lagged_component_names(self): freq=2, ) target_with_static_cov = target_with_static_cov.with_static_covariates( 
- pd.Series([1], index=["dummy_static_cov"]) + pd.DataFrame({"dummy": [1]}) # leads to "global" static cov component name + ) + target_with_static_cov2 = target_with_static_cov.with_static_covariates( + pd.DataFrame( + {"dummy": [i for i in range(n_comp)]} + ) # leads to sharing target component names + ) + target_with_static_cov3 = target_with_static_cov.with_static_covariates( + pd.DataFrame( + { + "dummy": [i for i in range(n_comp)], + "dummy1": [i for i in range(n_comp)], + } + ) # leads to sharing target component names ) past = self.create_multivariate_linear_timeseries( @@ -1851,7 +1864,7 @@ def test_create_lagged_component_names(self): ) # target no static covariate - expected_lagged_features = ["no_static_lag-2", "no_static_lag-1"] + expected_lagged_features = ["no_static_target_lag-2", "no_static_target_lag-1"] created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, past_covariates=None, @@ -1860,15 +1873,16 @@ def test_create_lagged_component_names(self): lags_past_covariates=None, lags_future_covariates=None, concatenate=False, + use_static_covariates=False, ) self.assertEqual(expected_lagged_features, created_lagged_features) - # target with static covariate (not handled by this function) + # target with static covariate (but don't use them in feature names) expected_lagged_features = [ - "static_0_lag-4", - "static_1_lag-4", - "static_0_lag-1", - "static_1_lag-1", + "static_0_target_lag-4", + "static_1_target_lag-4", + "static_0_target_lag-1", + "static_1_target_lag-1", ] created_lagged_features, _ = create_lagged_component_names( target_series=target_with_static_cov, @@ -1878,16 +1892,81 @@ def test_create_lagged_component_names(self): lags_past_covariates=None, lags_future_covariates=None, concatenate=False, + use_static_covariates=False, + ) + self.assertEqual(expected_lagged_features, created_lagged_features) + + # target with static covariate (acting on global target components) + expected_lagged_features = [ 
+ "static_0_target_lag-4", + "static_1_target_lag-4", + "static_0_target_lag-1", + "static_1_target_lag-1", + "dummy_statcov_target_global_components", + ] + created_lagged_features, _ = create_lagged_component_names( + target_series=target_with_static_cov, + past_covariates=None, + future_covariates=None, + lags=[-4, -1], + lags_past_covariates=None, + lags_future_covariates=None, + concatenate=False, + use_static_covariates=True, + ) + self.assertEqual(expected_lagged_features, created_lagged_features) + + # target with static covariate (component specific) + expected_lagged_features = [ + "static_0_target_lag-4", + "static_1_target_lag-4", + "static_0_target_lag-1", + "static_1_target_lag-1", + "dummy_statcov_target_static_0", + "dummy_statcov_target_static_1", + ] + created_lagged_features, _ = create_lagged_component_names( + target_series=target_with_static_cov2, + past_covariates=None, + future_covariates=None, + lags=[-4, -1], + lags_past_covariates=None, + lags_future_covariates=None, + concatenate=False, + use_static_covariates=True, + ) + self.assertEqual(expected_lagged_features, created_lagged_features) + + # target with static covariate (component specific & multivariate) + expected_lagged_features = [ + "static_0_target_lag-4", + "static_1_target_lag-4", + "static_0_target_lag-1", + "static_1_target_lag-1", + "dummy_statcov_target_static_0", + "dummy_statcov_target_static_1", + "dummy1_statcov_target_static_0", + "dummy1_statcov_target_static_1", + ] + created_lagged_features, _ = create_lagged_component_names( + target_series=target_with_static_cov3, + past_covariates=None, + future_covariates=None, + lags=[-4, -1], + lags_past_covariates=None, + lags_future_covariates=None, + concatenate=False, + use_static_covariates=True, ) self.assertEqual(expected_lagged_features, created_lagged_features) # target + past expected_lagged_features = [ - "no_static_lag-4", - "no_static_lag-3", - "past_0_lag-1", - "past_1_lag-1", - "past_2_lag-1", + 
"no_static_target_lag-4", + "no_static_target_lag-3", + "past_0_pastcov_lag-1", + "past_1_pastcov_lag-1", + "past_2_pastcov_lag-1", ] created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, @@ -1902,12 +1981,12 @@ def test_create_lagged_component_names(self): # target + future expected_lagged_features = [ - "no_static_lag-2", - "no_static_lag-1", - "future_0_lag3", - "future_1_lag3", - "future_2_lag3", - "future_3_lag3", + "no_static_target_lag-2", + "no_static_target_lag-1", + "future_0_futcov_lag3", + "future_1_futcov_lag3", + "future_2_futcov_lag3", + "future_3_futcov_lag3", ] created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, @@ -1922,13 +2001,13 @@ def test_create_lagged_component_names(self): # past + future expected_lagged_features = [ - "past_0_lag-1", - "past_1_lag-1", - "past_2_lag-1", - "future_0_lag2", - "future_1_lag2", - "future_2_lag2", - "future_3_lag2", + "past_0_pastcov_lag-1", + "past_1_pastcov_lag-1", + "past_2_pastcov_lag-1", + "future_0_futcov_lag2", + "future_1_futcov_lag2", + "future_2_futcov_lag2", + "future_3_futcov_lag2", ] created_lagged_features, _ = create_lagged_component_names( target_series=target_with_no_cov, @@ -1943,17 +2022,17 @@ def test_create_lagged_component_names(self): # target with static + past + future expected_lagged_features = [ - "static_0_lag-2", - "static_1_lag-2", - "static_0_lag-1", - "static_1_lag-1", - "past_0_lag-1", - "past_1_lag-1", - "past_2_lag-1", - "future_0_lag2", - "future_1_lag2", - "future_2_lag2", - "future_3_lag2", + "static_0_target_lag-2", + "static_1_target_lag-2", + "static_0_target_lag-1", + "static_1_target_lag-1", + "past_0_pastcov_lag-1", + "past_1_pastcov_lag-1", + "past_2_pastcov_lag-1", + "future_0_futcov_lag2", + "future_1_futcov_lag2", + "future_2_futcov_lag2", + "future_3_futcov_lag2", ] created_lagged_features, _ = create_lagged_component_names( target_series=target_with_static_cov, @@ -1968,15 +2047,15 
@@ def test_create_lagged_component_names(self): # multiple series with same components, including past/future covariates expected_lagged_features = [ - "static_0_lag-3", - "static_1_lag-3", - "past_0_lag-1", - "past_1_lag-1", - "past_2_lag-1", - "future_0_lag2", - "future_1_lag2", - "future_2_lag2", - "future_3_lag2", + "static_0_target_lag-3", + "static_1_target_lag-3", + "past_0_pastcov_lag-1", + "past_1_pastcov_lag-1", + "past_2_pastcov_lag-1", + "future_0_futcov_lag2", + "future_1_futcov_lag2", + "future_2_futcov_lag2", + "future_3_futcov_lag2", ] created_lagged_features, _ = create_lagged_component_names( target_series=[target_with_static_cov, target_with_static_cov], @@ -1989,19 +2068,19 @@ def test_create_lagged_component_names(self): ) self.assertEqual(expected_lagged_features, created_lagged_features) - # multiple series with different components, same past, different future + # multiple series with different components will use the first series as reference expected_lagged_features = [ - "comp0_target_lag-2", - "comp1_target_lag-2", - "comp0_target_lag-1", - "comp1_target_lag-1", - "past_0_lag-1", - "past_1_lag-1", - "past_2_lag-1", - "comp0_future_cov_lag2", - "comp1_future_cov_lag2", - "comp2_future_cov_lag2", - "comp3_future_cov_lag2", + "static_0_target_lag-2", + "static_1_target_lag-2", + "static_0_target_lag-1", + "static_1_target_lag-1", + "past_0_pastcov_lag-1", + "past_1_pastcov_lag-1", + "past_2_pastcov_lag-1", + "future_0_futcov_lag2", + "future_1_futcov_lag2", + "future_2_futcov_lag2", + "future_3_futcov_lag2", ] created_lagged_features, _ = create_lagged_component_names( target_series=[ From 557dbfe138a087cd5370f204280536095857151f Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 11 Apr 2023 22:48:33 +0200 Subject: [PATCH 11/12] fix small bug in unit test --- darts/tests/models/forecasting/test_regression_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 8857864287..4eeadda9ee 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -872,9 +872,9 @@ def test_static_cov_accuracy(self): model_static_cov = RandomForest(lags=period // 2, bootstrap=False) model_static_cov.fit(fitting_series) - # multiple univariates series with different names with same static cov + # multiple univariates series with different names with same static cov, will take name of first series expected_features_in = [ - f"comp0_target_lag{str(-i)}" for i in range(period // 2, 0, -1) + f"smooth_target_lag{str(-i)}" for i in range(period // 2, 0, -1) ] + ["curve_type_statcov_target_smooth"] self.assertEqual(model_static_cov.lagged_feature_names, expected_features_in) From 05428e55572f6a3adff567edd08e13164798d5f7 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Wed, 12 Apr 2023 00:15:08 +0200 Subject: [PATCH 12/12] fix bug in unittest from last PR --- .../models/forecasting/test_regression_models.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 4eeadda9ee..d62b1c134a 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -772,7 +772,11 @@ def test_optional_static_covariates(self): assert not model.uses_static_covariates assert model._static_covariates_shape is None preds = model.predict(n=2, series=series) - assert preds.static_covariates.equals(series.static_covariates) + # there seem to be some dtype issues with python=3.7 + np.testing.assert_almost_equal( + preds.static_covariates.values, + series.static_covariates.values, + ) # with `use_static_covariates=True`, static covariates are included model = 
model_cls(lags=4, use_static_covariates=True) @@ -780,9 +784,11 @@ def test_optional_static_covariates(self): assert model.uses_static_covariates assert model._static_covariates_shape == series.static_covariates.shape preds = model.predict(n=2, series=[series, series]) - assert all( - [p.static_covariates.equals(series.static_covariates) for p in preds] - ) + for pred in preds: + np.testing.assert_almost_equal( + pred.static_covariates.values, + series.static_covariates.values, + ) def test_static_cov_accuracy(self): """