Skip to content

Commit 864d190

Browse files
mabilton, Matthew Bilton, eliane-maalouf, hrzn
authored
Refactorised tabularisation + Jupyter notebook w/ experiments. (#1399)
* Refactorised tabularisation + Jupyter notebook w/ experiments. * Added 'moving window' method + refactored 'time intersection' method. * Refactoring/code simplification + bug fixes. * Added `is_training` flag. * Added tests + bug fixes. * More tests + debugging. * Fixed zero lag value not allowed bug + other debugging. * RegressionModel now calls `create_lagged_training_data` + passing tests. * ShapExplainer now uses `create_lagged_prediction_data` + minor test modifications. * Added further documentation, esp to tests. * Moved `_add_static_covariates` from `tabularization.py` to `regression_model.py`. * Static covariates refactorings. * typo fix in test_regression_models.py Co-authored-by: eliane-maalouf <[email protected]> * Removed old `_create_lagged_data` and tests notebook. * Clarification about `check_inputs` in docstring. * Allow `lags_future_covariates` to be `> 0`, and enforce `lags_past_covariates < 0`. * Made `get_feature_times` private, now `_get_feature_times`. * Placed `for` loop back inside `create_lagged_data`; more info in docstrings; test `Sequence[TimeSeries]` inputs and stochastic inputs. * Fixed `bootstrap=True` in `test_regression_models.py`. * Added note about `np.split` in `regression_model.py`. * Fixed repeated static covariates width calculation. * Fixed `shap_explainer` bug. * Amended static covariates test so that values of appended static cov columns are explicitly checked. * Updated docstring error. Co-authored-by: Matthew Bilton <[email protected]> Co-authored-by: eliane-maalouf <[email protected]> Co-authored-by: Julien Herzen <[email protected]>
1 parent 5483e2f commit 864d190

12 files changed

+7008
-467
lines changed

darts/explainability/shap_explainer.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
)
3636
from darts.logging import get_logger, raise_if, raise_log
3737
from darts.models.forecasting.regression_model import RegressionModel
38-
from darts.utils.data.tabularization import _create_lagged_data
38+
from darts.utils.data.tabularization import create_lagged_prediction_data
3939
from darts.utils.utils import series2seq
4040

4141
logger = get_logger(__name__)
@@ -665,16 +665,18 @@ def _create_regression_model_shap_X(
665665
lags_past_covariates_list = self.model.lags.get("past")
666666
lags_future_covariates_list = self.model.lags.get("future")
667667

668-
X, _, indexes = _create_lagged_data(
669-
target_series,
670-
self.n,
671-
past_covariates,
672-
future_covariates,
673-
lags_list,
674-
lags_past_covariates_list,
675-
lags_future_covariates_list,
676-
is_training=False,
668+
X, indexes = create_lagged_prediction_data(
669+
target_series=target_series,
670+
past_covariates=past_covariates,
671+
future_covariates=future_covariates,
672+
lags=lags_list,
673+
lags_past_covariates=lags_past_covariates_list if past_covariates else None,
674+
lags_future_covariates=lags_future_covariates_list
675+
if future_covariates
676+
else None,
677677
)
678+
# Remove sample axis:
679+
X = X[:, :, 0]
678680

679681
if train:
680682
X = pd.DataFrame(X)

darts/models/forecasting/regression_model.py

+111-19
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from darts.logging import get_logger, raise_if, raise_if_not, raise_log
3737
from darts.models.forecasting.forecasting_model import GlobalForecastingModel
3838
from darts.timeseries import TimeSeries
39-
from darts.utils.data.tabularization import _add_static_covariates, _create_lagged_data
39+
from darts.utils.data.tabularization import create_lagged_training_data
4040
from darts.utils.multioutput import MultiOutputRegressor
4141
from darts.utils.utils import _check_quantiles, seq2series, series2seq
4242

@@ -324,7 +324,7 @@ def _create_lagged_data(
324324
lags_past_covariates = self.lags.get("past")
325325
lags_future_covariates = self.lags.get("future")
326326

327-
training_samples, training_labels, _ = _create_lagged_data(
327+
features, labels, _ = create_lagged_training_data(
328328
target_series=target_series,
329329
output_chunk_length=self.output_chunk_length,
330330
past_covariates=past_covariates,
@@ -334,20 +334,111 @@ def _create_lagged_data(
334334
lags_future_covariates=lags_future_covariates,
335335
max_samples_per_ts=max_samples_per_ts,
336336
multi_models=self.multi_models,
337+
check_inputs=False,
338+
concatenate=False,
337339
)
338340

339-
training_samples = _add_static_covariates(
340-
self,
341-
training_samples,
341+
for i, (X_i, y_i) in enumerate(zip(features, labels)):
342+
features[i] = X_i[:, :, 0]
343+
labels[i] = y_i[:, :, 0]
344+
345+
features = self._add_static_covariates(
346+
features,
342347
target_series,
343-
*self.extreme_lags,
344-
past_covariates=past_covariates,
345-
future_covariates=future_covariates,
346-
max_samples_per_ts=max_samples_per_ts,
347348
)
348349

350+
training_samples = np.concatenate(features, axis=0)
351+
training_labels = np.concatenate(labels, axis=0)
352+
349353
return training_samples, training_labels
350354

355+
def _add_static_covariates(
356+
self,
357+
features: Union[np.array, Sequence[np.array]],
358+
target_series: Union[TimeSeries, Sequence[TimeSeries]],
359+
) -> Union[np.array, Sequence[np.array]]:
360+
"""
361+
Add static covariates to the features' table for RegressionModels.
362+
Accounts for series with potentially different static covariates by padding with 0 to accommodate the maximum
363+
number of available static_covariates in any of the given series in the sequence.
364+
365+
If no static covariates are provided for a given series, its corresponding features are padded with 0.
366+
Accounts for the case where the model is trained with series with static covariates and then used to predict
367+
on series without static covariates by padding with 0 the corresponding features of the series without
368+
static covariates.
369+
370+
Parameters
371+
----------
372+
features
373+
The features' numpy array(s) to which the static covariates will be added. Can either be a lone feature
374+
matrix or a `Sequence` of feature matrices; in the latter case, static covariates will be appended to
375+
each feature matrix in this `Sequence`.
376+
target_series
377+
The target series from which to read the static covariates.
378+
379+
Returns
380+
-------
381+
features
382+
The features' array(s) with appended static covariates columns. If the `features` input was passed as a
383+
`Sequence` of `np.array`s, then a `Sequence` is also returned; if `features` was passed as an `np.array`,
384+
a `np.array` is returned.
385+
"""
386+
387+
input_not_list = not isinstance(features, Sequence)
388+
if input_not_list:
389+
features = [features]
390+
target_series = series2seq(target_series)
391+
# collect static covariates info
392+
scovs_map = {
393+
"covs_exist": False,
394+
"vals": [], # Stores values of static cov arrays in each timeseries
395+
"sizes": {}, # Stores sizes of static cov arrays in each timeseries
396+
}
397+
for ts in target_series:
398+
if ts.has_static_covariates:
399+
scovs_map["covs_exist"] = True
400+
# Each static covariate either adds 1 extra column or
401+
# `n_component` extra columns:
402+
vals_i = {}
403+
for name, row in ts.static_covariates.items():
404+
vals_i[name] = row
405+
scovs_map["sizes"][name] = row.size
406+
scovs_map["vals"].append(vals_i)
407+
else:
408+
scovs_map["vals"].append(None)
409+
410+
if (
411+
not scovs_map["covs_exist"]
412+
and hasattr(self.model, "n_features_in_")
413+
and (self.model.n_features_in_ is not None)
414+
and (self.model.n_features_in_ > features[0].shape[1])
415+
):
416+
# for when series in prediction do not have static covariates but some of the training series did
417+
num_static_components = self.model.n_features_in_ - features[0].shape[1]
418+
for i, features_i in enumerate(features):
419+
padding = np.zeros((features_i.shape[0], num_static_components))
420+
features[i] = np.hstack([features_i, padding])
421+
elif scovs_map["covs_exist"]:
422+
scov_width = sum(scovs_map["sizes"].values())
423+
for i, features_i in enumerate(features):
424+
vals = scovs_map["vals"][i]
425+
if vals:
426+
scov_arrays = []
427+
for name, size in scovs_map["sizes"].items():
428+
scov_arrays.append(
429+
vals[name] if name in vals else np.zeros((size,))
430+
)
431+
scov_array = np.concatenate(scov_arrays)
432+
scovs = np.broadcast_to(
433+
scov_array, (features_i.shape[0], scov_width)
434+
)
435+
else:
436+
scovs = np.zeros((features_i.shape[0], scov_width))
437+
features[i] = np.hstack([features_i, scovs])
438+
if input_not_list:
439+
features = features[0]
440+
return features
441+
351442
def _fit_model(
352443
self,
353444
target_series,
@@ -362,7 +453,10 @@ def _fit_model(
362453
"""
363454

364455
training_samples, training_labels = self._create_lagged_data(
365-
target_series, past_covariates, future_covariates, max_samples_per_ts
456+
target_series,
457+
past_covariates,
458+
future_covariates,
459+
max_samples_per_ts,
366460
)
367461

368462
# if training_labels is of shape (n_samples, 1) flatten it to shape (n_samples,)
@@ -681,15 +775,13 @@ def predict(
681775

682776
# concatenate retrieved lags
683777
X = np.concatenate(np_X, axis=1)
684-
X = _add_static_covariates(
685-
self,
686-
X,
687-
series,
688-
*self.extreme_lags,
689-
past_covariates=past_covariates,
690-
future_covariates=future_covariates,
691-
max_samples_per_ts=1,
692-
)
778+
# Need to split up `X` into `len(series)` equally-sized sub-blocks
779+
# corresponding to each timeseries in `series`, so that
780+
# static covariates can be added to each block; valid since
781+
# each block contains same number of observations:
782+
X_blocks = np.split(X, len(series), axis=0)
783+
X_blocks = self._add_static_covariates(X_blocks, series)
784+
X = np.concatenate(X_blocks, axis=0)
693785

694786
# X has shape (n_series * n_samples, n_regression_features)
695787
prediction = self._predict_and_sample(X, num_samples, **kwargs)

0 commit comments

Comments
 (0)