diff --git a/.github/workflows/docs-on-pr.yml b/.github/workflows/docs-on-pr.yml index 42bc8c982..c32372d59 100644 --- a/.github/workflows/docs-on-pr.yml +++ b/.github/workflows/docs-on-pr.yml @@ -16,6 +16,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true - name: Load cached venv diff --git a/.github/workflows/docs-unstable.yml b/.github/workflows/docs-unstable.yml index 5dbbe59e5..11fb090da 100644 --- a/.github/workflows/docs-unstable.yml +++ b/.github/workflows/docs-unstable.yml @@ -17,6 +17,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true - name: Load cached venv diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 15cb3ba4e..83ba01fce 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -26,6 +26,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true - name: Install dependencies diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1b4f40d09..d735cf455 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: - name: Install Dependencies run: | - pip install poetry + pip install poetry==1.4.0 # TODO: remove after poetry fix poetry --version poetry config virtualenvs.in-project true poetry install -E style --no-root @@ -48,6 +48,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true @@ -86,6 +87,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true @@ 
-123,6 +125,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true @@ -160,6 +163,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true @@ -199,6 +203,7 @@ jobs: - name: Install Poetry uses: snok/install-poetry@v1 with: + version: 1.4.0 # TODO: remove after poetry fix virtualenvs-create: true virtualenvs-in-project: true diff --git a/CHANGELOG.md b/CHANGELOG.md index e5a23c23e..4e67592fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,18 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Added -- +- Add `refit` parameter into `backtest` ([#1159](https://github.com/tinkoff-ai/etna/pull/1159)) +- Add `stride` parameter into `backtest` ([#1165](https://github.com/tinkoff-ai/etna/pull/1165)) +- Add optional parameter `ts` into `forecast` method of pipelines ([#1071](https://github.com/tinkoff-ai/etna/pull/1071)) +- Add tests on `transform` method of transforms on subset of segments, on new segments, on future with gap ([#1094](https://github.com/tinkoff-ai/etna/pull/1094)) +- Add tests on `inverse_transform` method of transforms on subset of segments, on new segments, on future with gap ([#1127](https://github.com/tinkoff-ai/etna/pull/1127)) - - - - ### Changed -- +- Add more scenarios into tests for models ([#1082](https://github.com/tinkoff-ai/etna/pull/1082)) +- Decouple `SeasonalMovingAverageModel` from `PerSegmentModelMixin` ([#1132](https://github.com/tinkoff-ai/etna/pull/1132)) +- Decouple `DeadlineMovingAverageModel` from `PerSegmentModelMixin` ([#1140](https://github.com/tinkoff-ai/etna/pull/1140)) - - - ### Fixed -- +- Fix inference tests on new segments for `DeepARModel` and `TFTModel` ([#1109](https://github.com/tinkoff-ai/etna/pull/1109)) +- Fix alignment during 
forecasting in new NNs, add validation of context size during forecasting in new NNs, add validation of batch in `MLPNet` ([#1108](https://github.com/tinkoff-ai/etna/pull/1108)) +- Fix `MeanSegmentEncoderTransform` to work with subset of segments and raise error on new segments ([#1104](https://github.com/tinkoff-ai/etna/pull/1104)) +- Fix outliers transforms on future with gap ([#1147](https://github.com/tinkoff-ai/etna/pull/1147)) +- Fix `SegmentEncoderTransform` to work with subset of segments and raise error on new segments ([#1103](https://github.com/tinkoff-ai/etna/pull/1103)) +- Fix `SklearnTransform` in per-segment mode to work on subset of segments and raise error on new segments ([#1107](https://github.com/tinkoff-ai/etna/pull/1107)) +- Fix `OutliersTransform` and its children to raise error on new segments ([#1139](https://github.com/tinkoff-ai/etna/pull/1139)) +- Fix `DifferencingTransform` to raise error on new segments during `transform` and `inverse_transform` in inplace mode ([#1141](https://github.com/tinkoff-ai/etna/pull/1141)) +- Teach `DifferencingTransform` to `inverse_transform` with NaNs ([#1155](https://github.com/tinkoff-ai/etna/pull/1155)) - - - diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index a1d1baf57..9744c9b08 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -17,3 +17,4 @@ Tutorials tutorials/NN_examples tutorials/classification tutorials/hierarchical_pipeline + tutorials/inference diff --git a/etna/ensembles/direct_ensemble.py b/etna/ensembles/direct_ensemble.py index 4b20c7e50..f5203ffb6 100644 --- a/etna/ensembles/direct_ensemble.py +++ b/etna/ensembles/direct_ensemble.py @@ -122,17 +122,14 @@ def _merge(self, forecasts: List[TSDataset]) -> TSDataset: forecast_dataset = TSDataset(df=forecast_df, freq=forecasts[0].freq) return forecast_dataset - def _forecast(self) -> TSDataset: + def _forecast(self, ts: TSDataset) -> TSDataset: """Make predictions. 
In each point in the future, forecast of the ensemble is forecast of base pipeline with the shortest horizon, which covers this point. """ - if self.ts is None: - raise ValueError("Something went wrong, ts is None!") - forecasts = Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=11)( - delayed(self._forecast_pipeline)(pipeline=pipeline) for pipeline in self.pipelines + delayed(self._forecast_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) forecast = self._merge(forecasts=forecasts) return forecast diff --git a/etna/ensembles/mixins.py b/etna/ensembles/mixins.py index 256ddd602..81e8f02c9 100644 --- a/etna/ensembles/mixins.py +++ b/etna/ensembles/mixins.py @@ -41,10 +41,10 @@ def _fit_pipeline(pipeline: BasePipeline, ts: TSDataset) -> BasePipeline: return pipeline @staticmethod - def _forecast_pipeline(pipeline: BasePipeline) -> TSDataset: + def _forecast_pipeline(pipeline: BasePipeline, ts: TSDataset) -> TSDataset: """Make forecast with given pipeline.""" tslogger.log(msg=f"Start forecasting with {pipeline}.") - forecast = pipeline.forecast() + forecast = pipeline.forecast(ts=ts) tslogger.log(msg=f"Forecast is done with {pipeline}.") return forecast diff --git a/etna/ensembles/stacking_ensemble.py b/etna/ensembles/stacking_ensemble.py index db5fd31e5..8d9e98033 100644 --- a/etna/ensembles/stacking_ensemble.py +++ b/etna/ensembles/stacking_ensemble.py @@ -8,7 +8,6 @@ from typing import Set from typing import Tuple from typing import Union -from typing import cast import numpy as np import pandas as pd @@ -160,7 +159,7 @@ def fit(self, ts: TSDataset) -> "StackingEnsemble": # Fit the final model self.filtered_features_for_final_model = self._filter_features_to_use(forecasts) - x, y = self._make_features(forecasts=forecasts, train=True) + x, y = self._make_features(ts=self.ts, forecasts=forecasts, train=True) self.final_model.fit(x, y) # Fit the base models @@ -170,12 +169,9 @@ def fit(self, ts: TSDataset) -> "StackingEnsemble": 
return self def _make_features( - self, forecasts: List[TSDataset], train: bool = False + self, ts: TSDataset, forecasts: List[TSDataset], train: bool = False ) -> Tuple[pd.DataFrame, Optional[pd.Series]]: """Prepare features for the ``final_model``.""" - if self.ts is None: - raise ValueError("StackingEnsemble is not fitted! Fit the StackingEnsemble before calling forecast method.") - # Stack targets from the forecasts targets = [ forecast[:, :, "target"].rename({"target": f"regressor_target_{i}"}, axis=1) @@ -201,29 +197,25 @@ def _make_features( features_df = pd.concat([features, targets], axis=1) # Flatten the features to fit the sklearn interface - x = pd.concat([features_df.loc[:, segment] for segment in self.ts.segments], axis=0) + x = pd.concat([features_df.loc[:, segment] for segment in ts.segments], axis=0) if train: y = pd.concat( - [ - self.ts[forecasts[0].index.min() : forecasts[0].index.max(), segment, "target"] - for segment in self.ts.segments - ], + [ts[forecasts[0].index.min() : forecasts[0].index.max(), segment, "target"] for segment in ts.segments], axis=0, ) return x, y else: return x, None - def _process_forecasts(self, forecasts: List[TSDataset]) -> TSDataset: - x, _ = self._make_features(forecasts=forecasts, train=False) - self.ts = cast(TSDataset, self.ts) + def _process_forecasts(self, ts: TSDataset, forecasts: List[TSDataset]) -> TSDataset: + x, _ = self._make_features(ts=ts, forecasts=forecasts, train=False) y = self.final_model.predict(x) num_segments = len(forecasts[0].segments) y = y.reshape(num_segments, -1).T num_timestamps = y.shape[0] # Format the forecast into TSDataset - segment_col = [segment for segment in self.ts.segments for _ in range(num_timestamps)] + segment_col = [segment for segment in ts.segments for _ in range(num_timestamps)] x.loc[:, "segment"] = segment_col x.loc[:, "timestamp"] = x.index.values df_exog = TSDataset.to_dataset(x) @@ -231,21 +223,19 @@ def _process_forecasts(self, forecasts: List[TSDataset]) -> 
TSDataset: df = forecasts[0][:, :, "target"].copy() df.loc[pd.IndexSlice[:], pd.IndexSlice[:, "target"]] = np.NAN - result = TSDataset(df=df, freq=self.ts.freq, df_exog=df_exog) + result = TSDataset(df=df, freq=ts.freq, df_exog=df_exog) result.loc[pd.IndexSlice[:], pd.IndexSlice[:, "target"]] = y return result - def _forecast(self) -> TSDataset: + def _forecast(self, ts: TSDataset) -> TSDataset: """Make predictions. Compute the combination of pipelines' forecasts using ``final_model`` """ - if self.ts is None: - raise ValueError("Something went wrong, ts is None!") forecasts = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( - delayed(self._forecast_pipeline)(pipeline=pipeline) for pipeline in self.pipelines + delayed(self._forecast_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) - forecast = self._process_forecasts(forecasts=forecasts) + forecast = self._process_forecasts(ts=ts, forecasts=forecasts) return forecast def _predict( @@ -265,5 +255,5 @@ def _predict( ) for pipeline in self.pipelines ) - prediction = self._process_forecasts(forecasts=predictions) + prediction = self._process_forecasts(ts=ts, forecasts=predictions) return prediction diff --git a/etna/ensembles/voting_ensemble.py b/etna/ensembles/voting_ensemble.py index effc1edfa..eb42c4219 100644 --- a/etna/ensembles/voting_ensemble.py +++ b/etna/ensembles/voting_ensemble.py @@ -5,7 +5,6 @@ from typing import Optional from typing import Sequence from typing import Union -from typing import cast import pandas as pd from joblib import Parallel @@ -199,16 +198,13 @@ def _vote(self, forecasts: List[TSDataset]) -> TSDataset: forecast_dataset = TSDataset(df=forecast_df, freq=forecasts[0].freq) return forecast_dataset - def _forecast(self) -> TSDataset: + def _forecast(self, ts: TSDataset) -> TSDataset: """Make predictions. 
Compute weighted average of pipelines' forecasts """ - if self.ts is None: - raise ValueError("Something went wrong, ts is None!") - forecasts = Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=11)( - delayed(self._forecast_pipeline)(pipeline=pipeline) for pipeline in self.pipelines + delayed(self._forecast_pipeline)(pipeline=pipeline, ts=ts) for pipeline in self.pipelines ) forecast = self._vote(forecasts=forecasts) return forecast @@ -224,7 +220,6 @@ def _predict( if prediction_interval: raise NotImplementedError(f"Ensemble {self.__class__.__name__} doesn't support prediction intervals!") - self.ts = cast(TSDataset, self.ts) predictions = Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=11)( delayed(self._predict_pipeline)( ts=ts, pipeline=pipeline, start_timestamp=start_timestamp, end_timestamp=end_timestamp diff --git a/etna/models/base.py b/etna/models/base.py index abfa20b4d..cf307de53 100644 --- a/etna/models/base.py +++ b/etna/models/base.py @@ -622,6 +622,12 @@ def forecast(self, ts: "TSDataset", prediction_size: int) -> "TSDataset": : Dataset with predictions """ + expected_length = prediction_size + self.encoder_length + if len(ts.index) < expected_length: + raise ValueError( + "Given context isn't big enough, try to decrease context_size, prediction_size or increase length of given dataset!" 
+ ) + test_dataset = ts.to_torch_dataset( make_samples=functools.partial( self.net.make_samples, encoder_length=self.encoder_length, decoder_length=prediction_size @@ -629,9 +635,11 @@ def forecast(self, ts: "TSDataset", prediction_size: int) -> "TSDataset": dropna=False, ) predictions = self.raw_predict(test_dataset) - future_ts = ts.tsdataset_idx_slice(start_idx=self.encoder_length, end_idx=self.encoder_length + prediction_size) + end_idx = len(ts.index) + future_ts = ts.tsdataset_idx_slice(start_idx=end_idx - prediction_size, end_idx=end_idx) for (segment, feature_nm), value in predictions.items(): - future_ts.df.loc[:, pd.IndexSlice[segment, feature_nm]] = value[:prediction_size, :] + # we don't want to change dtype after assignment, but there can happen cast to float32 + future_ts.df.loc[:, pd.IndexSlice[segment, feature_nm]] = value[:prediction_size, :].astype(np.float64) future_ts.inverse_transform() diff --git a/etna/models/deadline_ma.py b/etna/models/deadline_ma.py index 8549d4c68..003c2c480 100644 --- a/etna/models/deadline_ma.py +++ b/etna/models/deadline_ma.py @@ -1,14 +1,12 @@ import warnings from enum import Enum -from typing import Dict -from typing import List +from typing import Optional import numpy as np import pandas as pd +from etna.datasets import TSDataset from etna.models.base import NonPredictionIntervalContextRequiredAbstractModel -from etna.models.mixins import NonPredictionIntervalContextRequiredModelMixin -from etna.models.mixins import PerSegmentModelMixin class SeasonalityMode(Enum): @@ -24,76 +22,99 @@ def _missing_(cls, value): ) -class _DeadlineMovingAverageModel: +class DeadlineMovingAverageModel( + NonPredictionIntervalContextRequiredAbstractModel, +): """Moving average model that uses exact previous dates to predict.""" def __init__(self, window: int = 3, seasonality: str = "month"): - """ - Initialize deadline moving average model. + """Initialize deadline moving average model. 
Length of the context is equal to the number of ``window`` months or years, depending on the ``seasonality``. Parameters ---------- - window: int + window: Number of values taken for forecast for each point. - seasonality: str + seasonality: Only allowed monthly or annual seasonality. """ - self.name = "target" self.window = window self.seasonality = SeasonalityMode(seasonality) - self.freqs_available = {"H", "D"} - self._freq = None + self._freqs_available = {"H", "D"} + self._freq: Optional[str] = None + + def _validate_fitted(self): + """Check if model is fitted.""" + if self._freq is None: + raise ValueError("Model is not fitted! Fit the model before trying the find out context size!") + + @property + def context_size(self) -> int: + """Upper bound to context size of the model.""" + self._validate_fitted() + + cur_value = None + if self.seasonality is SeasonalityMode.year: + cur_value = 366 + elif self.seasonality is SeasonalityMode.month: + cur_value = 31 - def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_DeadlineMovingAverageModel": + if self._freq == "H": + cur_value *= 24 + + cur_value *= self.window + + return cur_value + + def get_model(self) -> "DeadlineMovingAverageModel": + """Get internal model. + + Returns + ------- + : + Itself """ - Fit DeadlineMovingAverageModel model. + return self + + def fit(self, ts: TSDataset) -> "DeadlineMovingAverageModel": + """Fit model. Parameters ---------- - df: pd.DataFrame - Data to fit on - regressors: - List of the columns with regressors(ignored in this model) - - Raises - ------ - ValueError - If freq of dataframe is not supported - ValueError - If series is too short for chosen shift value + ts: + Dataset with features Returns ------- : - Fitted model + Model after fit """ - freq = pd.infer_freq(df["timestamp"]) - if freq not in self.freqs_available: - raise ValueError(f"{freq} is not supported! 
Use daily or hourly frequency!") + # we make a normalization to treat "1d" like "D" + freq = pd.tseries.frequencies.to_offset(ts.freq).freqstr + if freq not in self._freqs_available: + raise ValueError(f"Freq {freq} is not supported! Use daily or hourly frequency!") + + self._freq = freq - if set(df.columns) != {"timestamp", "target"}: + columns = set(ts.columns.get_level_values("feature")) + if columns != {"target"}: warnings.warn( message=f"{type(self).__name__} does not work with any exogenous series or features. " f"It uses only target series for predict/\n " ) - - self._freq = freq - return self @staticmethod def _get_context_beginning( df: pd.DataFrame, prediction_size: int, seasonality: SeasonalityMode, window: int ) -> pd.Timestamp: - """ - Get timestamp where context begins. + """Get timestamp where context begins. Parameters ---------- df: - Time series in a long format. + Time series in a wide format. prediction_size: Number of last timestamps to leave after making prediction. Previous timestamps will be used as a context for models that require it. @@ -113,29 +134,31 @@ def _get_context_beginning( if context isn't big enough """ df_history = df.iloc[:-prediction_size] - history_timestamps = df_history["timestamp"] - future_timestamps = df["timestamp"].iloc[-prediction_size:] + history_timestamps = df_history.index + future_timestamps = df.iloc[-prediction_size:].index # if we have len(history_timestamps) == 0, then len(df) <= prediction_size if len(history_timestamps) == 0: raise ValueError( - "Given context isn't big enough, try to decrease context_size, prediction_size of increase length of given dataframe!" + "Given context isn't big enough, try to decrease context_size, prediction_size or increase length of given dataframe!" 
) if seasonality is SeasonalityMode.month: - first_index = future_timestamps.iloc[0] - pd.DateOffset(months=window) + first_index = future_timestamps[0] - pd.DateOffset(months=window) elif seasonality is SeasonalityMode.year: - first_index = future_timestamps.iloc[0] - pd.DateOffset(years=window) + first_index = future_timestamps[0] - pd.DateOffset(years=window) - if first_index < history_timestamps.iloc[0]: + if first_index < history_timestamps[0]: raise ValueError( - "Given context isn't big enough, try to decrease context_size, prediction_size of increase length of given dataframe!" + "Given context isn't big enough, try to decrease context_size, prediction_size or increase length of given dataframe!" ) return first_index - def _make_predictions(self, result_template: pd.Series, context: pd.Series, prediction_size: int) -> np.ndarray: + def _make_predictions( + self, result_template: pd.DataFrame, context: pd.DataFrame, prediction_size: int + ) -> np.ndarray: """Make predictions using ``result_template`` as a base and ``context`` as a context.""" index = result_template.index start_idx = len(result_template) - prediction_size @@ -154,149 +177,117 @@ def _make_predictions(self, result_template: pd.Series, context: pd.Series, pred result_values = result_template.values[-prediction_size:] return result_values - def forecast(self, df: pd.DataFrame, prediction_size: int) -> np.ndarray: - """Compute autoregressive forecasts. - - Parameters - ---------- - df: - Features dataframe. - prediction_size: - Number of last timestamps to leave after making prediction. - Previous timestamps will be used as a context for models that require it. - - Returns - ------- - : - Array with predictions. 
- - Raises - ------ - ValueError: - if context isn't big enough - ValueError: - if forecast context contains NaNs - """ + def _forecast(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: + """Make autoregressive forecasts on a wide dataframe.""" context_beginning = self._get_context_beginning( df=df, prediction_size=prediction_size, seasonality=self.seasonality, window=self.window ) - df = df.set_index("timestamp") - df_history = df.iloc[:-prediction_size] - history = df_history["target"] - history = history[history.index >= context_beginning] + history = df.loc[:, pd.IndexSlice[:, "target"]] + history = history.iloc[:-prediction_size] + history = history.loc[history.index >= context_beginning] if np.any(history.isnull()): raise ValueError("There are NaNs in a forecast context, forecast method requires context to be filled!") + num_segments = history.shape[1] index = pd.date_range(start=context_beginning, end=df.index[-1], freq=self._freq) - result_template = np.append(history.values, np.zeros(prediction_size)) - result_template = pd.Series(result_template, index=index) + result_template = np.append(history.values, np.zeros((prediction_size, num_segments)), axis=0) + result_template = pd.DataFrame(result_template, index=index, columns=history.columns) result_values = self._make_predictions( result_template=result_template, context=result_template, prediction_size=prediction_size ) - return result_values - def predict(self, df: pd.DataFrame, prediction_size: int) -> np.ndarray: - """Compute predictions using true target data as context. + df = df.iloc[-prediction_size:] + y_pred = result_values[-prediction_size:] + df.loc[:, pd.IndexSlice[:, "target"]] = y_pred + return df + + def forecast(self, ts: TSDataset, prediction_size: int) -> TSDataset: + """Make autoregressive forecasts. Parameters ---------- - df: - Features dataframe. + ts: + Dataset with features prediction_size: Number of last timestamps to leave after making prediction. 
- Previous timestamps will be used as a context for models that require it. + Previous timestamps will be used as a context. Returns ------- : - Array with predictions. + Dataset with predictions Raises ------ + ValueError: + if model isn't fitted ValueError: if context isn't big enough ValueError: - if there are NaNs in a target column on timestamps that are required to make predictions + if forecast context contains NaNs """ + self._validate_fitted() + df = ts.to_pandas() + new_df = self._forecast(df=df, prediction_size=prediction_size) + ts.df = new_df + ts.inverse_transform() + return ts + + def _predict(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: + """Make predictions on a wide dataframe using true values as autoregression context.""" context_beginning = self._get_context_beginning( df=df, prediction_size=prediction_size, seasonality=self.seasonality, window=self.window ) - df = df.set_index("timestamp") - context = df["target"] - context = context[context.index >= context_beginning] - if np.any(np.isnan(context)): + context = df.loc[:, pd.IndexSlice[:, "target"]] + context = context.loc[context.index >= context_beginning] + if np.any(context.isnull()): raise ValueError("There are NaNs in a target column, predict method requires target to be filled!") + num_segments = context.shape[1] index = pd.date_range(start=df.index[-prediction_size], end=df.index[-1], freq=self._freq) - result_template = pd.Series(np.zeros(prediction_size), index=index) + result_template = pd.DataFrame(np.zeros((prediction_size, num_segments)), index=index, columns=context.columns) result_values = self._make_predictions( result_template=result_template, context=context, prediction_size=prediction_size ) - return result_values - - @property - def context_size(self) -> int: - """Upper bound to context size of the model.""" - cur_value = None - if self.seasonality is SeasonalityMode.year: - cur_value = 366 - elif self.seasonality is SeasonalityMode.month: - cur_value = 31 
- - if self._freq is None: - raise ValueError("Model is not fitted! Fit the model before trying the find out context size!") - if self._freq == "H": - cur_value *= 24 - - cur_value *= self.window - - return cur_value - -class DeadlineMovingAverageModel( - PerSegmentModelMixin, - NonPredictionIntervalContextRequiredModelMixin, - NonPredictionIntervalContextRequiredAbstractModel, -): - """Moving average model that uses exact previous dates to predict.""" + df = df.iloc[-prediction_size:] + y_pred = result_values[-prediction_size:] + df.loc[:, pd.IndexSlice[:, "target"]] = y_pred + return df - def __init__(self, window: int = 3, seasonality: str = "month"): - """ - Initialize deadline moving average model. - - Length of the context is equal to the number of ``window`` months or years, depending on the ``seasonality``. + def predict(self, ts: TSDataset, prediction_size: int) -> TSDataset: + """Make predictions using true values as autoregression context (teacher forcing). Parameters ---------- - window: int - Number of values taken for forecast for each point. - seasonality: str - Only allowed monthly or annual seasonality. - """ - self.window = window - self.seasonality = seasonality - super(DeadlineMovingAverageModel, self).__init__( - base_model=_DeadlineMovingAverageModel(window=window, seasonality=seasonality) - ) - - @property - def context_size(self) -> int: - """Upper bound to context size of the model.""" - models = self.get_model() - model = next(iter(models.values())) - return model.context_size - - def get_model(self) -> Dict[str, "DeadlineMovingAverageModel"]: - """Get internal model. + ts: + Dataset with features + prediction_size: + Number of last timestamps to leave after making prediction. + Previous timestamps will be used as a context. 
Returns ------- : - Internal model + Dataset with predictions + + Raises + ------ + ValueError: + if model isn't fitted + ValueError: + if context isn't big enough + ValueError: + if forecast context contains NaNs """ - return self._get_model() + self._validate_fitted() + df = ts.to_pandas() + new_df = self._predict(df=df, prediction_size=prediction_size) + ts.df = new_df + ts.inverse_transform() + return ts __all__ = ["DeadlineMovingAverageModel"] diff --git a/etna/models/mixins.py b/etna/models/mixins.py index 76b3659cb..f1cdf005f 100644 --- a/etna/models/mixins.py +++ b/etna/models/mixins.py @@ -324,11 +324,14 @@ def _make_predictions(self, ts: TSDataset, prediction_method: Callable, **kwargs """ result_list = list() df = ts.to_pandas() - for segment, model in self._get_model().items(): + models = self._get_model() + for segment in ts.segments: + if segment not in models: + raise NotImplementedError("Per-segment models can't make predictions on new segments!") + segment_model = models[segment] segment_predict = self._make_predictions_segment( - model=model, segment=segment, df=df, prediction_method=prediction_method, **kwargs + model=segment_model, segment=segment, df=df, prediction_method=prediction_method, **kwargs ) - result_list.append(segment_predict) result_df = pd.concat(result_list, ignore_index=True) diff --git a/etna/models/nn/mlp.py b/etna/models/nn/mlp.py index 3887b58a8..034b1b252 100644 --- a/etna/models/nn/mlp.py +++ b/etna/models/nn/mlp.py @@ -66,6 +66,11 @@ def __init__( layers.append(nn.Linear(in_features=hidden_size[-1], out_features=1)) self.mlp = nn.Sequential(*layers) + @staticmethod + def _validate_batch(batch: MLPBatch): + if batch["decoder_real"].isnan().sum().item(): + raise ValueError("There are NaNs in features, this model can't work with them!") + def forward(self, batch: MLPBatch): # type: ignore """Forward pass. 
@@ -78,6 +83,7 @@ def forward(self, batch: MLPBatch): # type: ignore : forecast """ + self._validate_batch(batch) decoder_real = batch["decoder_real"].float() return self.mlp(decoder_real) @@ -93,6 +99,7 @@ def step(self, batch: MLPBatch, *args, **kwargs): # type: ignore : loss, true_target, prediction_target """ + self._validate_batch(batch) decoder_real = batch["decoder_real"].float() decoder_target = batch["decoder_target"].float() diff --git a/etna/models/seasonal_ma.py b/etna/models/seasonal_ma.py index 99fc8d588..85c070821 100644 --- a/etna/models/seasonal_ma.py +++ b/etna/models/seasonal_ma.py @@ -1,18 +1,16 @@ import warnings -from typing import Dict -from typing import List import numpy as np import pandas as pd +from etna.datasets import TSDataset from etna.models.base import NonPredictionIntervalContextRequiredAbstractModel -from etna.models.mixins import NonPredictionIntervalContextRequiredModelMixin -from etna.models.mixins import PerSegmentModelMixin -class _SeasonalMovingAverageModel: - """ - Seasonal moving average. +class SeasonalMovingAverageModel( + NonPredictionIntervalContextRequiredAbstractModel, +): + """Seasonal moving average. .. math:: y_{t} = \\frac{\\sum_{i=1}^{n} y_{t-is} }{n}, @@ -28,165 +26,158 @@ def __init__(self, window: int = 5, seasonality: int = 7): Parameters ---------- - window: int + window: Number of values taken for forecast for each point. - seasonality: int + seasonality: Lag between values taken for forecast. """ - self.name = "target" self.window = window self.seasonality = seasonality - self.shift = self.window * self.seasonality - def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_SeasonalMovingAverageModel": - """ - Fit SeasonalMovingAverage model. 
+ @property + def context_size(self) -> int: + """Context size of the model.""" + return self.window * self.seasonality - Parameters - ---------- - df: - Data to fit on - regressors: - List of the columns with regressors(ignored in this model) + def get_model(self) -> "SeasonalMovingAverageModel": + """Get internal model. Returns ------- : - Fitted model + Itself """ - if set(df.columns) != {"timestamp", "target"}: - warnings.warn( - message=f"{type(self).__name__} does not work with any exogenous series or features. " - f"It uses only target series for predict/\n " - ) - return self - def forecast(self, df: pd.DataFrame, prediction_size: int) -> np.ndarray: - """Compute autoregressive forecasts. + def fit(self, ts: TSDataset) -> "SeasonalMovingAverageModel": + """Fit model. + + For this model, fit does nothing. Parameters ---------- - df: - Features dataframe. - prediction_size: - Number of last timestamps to leave after making prediction. - Previous timestamps will be used as a context for models that require it. + ts: + Dataset with features Returns ------- : - Array with predictions. - - Raises - ------ - ValueError: - if context isn't big enough - ValueError: - if forecast context contains NaNs + Model after fit """ - expected_length = prediction_size + self.shift + columns = set(ts.columns.get_level_values("feature")) + if columns != {"target"}: + warnings.warn( + message=f"{type(self).__name__} does not work with any exogenous series or features. " + f"It uses only target series for predict/\n " + ) + return self + + def _validate_context(self, df: pd.DataFrame, prediction_size: int): + """Validate that we have enough context to make prediction with given parameters.""" + expected_length = prediction_size + self.context_size + if len(df) < expected_length: raise ValueError( - "Given context isn't big enough, try to decrease context_size, prediction_size of increase length of given dataframe!" 
+ "Given context isn't big enough, try to decrease context_size, prediction_size or increase length of given dataframe!" ) - history = df["target"][-expected_length:-prediction_size] - if np.any(history.isnull()): + def _forecast(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: + """Make autoregressive forecasts on a wide dataframe.""" + self._validate_context(df=df, prediction_size=prediction_size) + + expected_length = prediction_size + self.context_size + history = df.loc[:, pd.IndexSlice[:, "target"]].values + history = history[-expected_length:-prediction_size] + if np.any(np.isnan(history)): raise ValueError("There are NaNs in a forecast context, forecast method requires context to be filled!") - res = np.append(history, np.zeros(prediction_size)) - for i in range(self.shift, len(res)): - res[i] = res[i - self.shift : i : self.seasonality].mean() + num_segments = history.shape[1] + res = np.append(history, np.zeros((prediction_size, num_segments)), axis=0) + for i in range(self.context_size, len(res)): + res[i] = res[i - self.context_size : i : self.seasonality].mean(axis=0) + + df = df.iloc[-prediction_size:] y_pred = res[-prediction_size:] - return y_pred + df.loc[:, pd.IndexSlice[:, "target"]] = y_pred + return df - def predict(self, df: pd.DataFrame, prediction_size: int) -> np.ndarray: - """Compute predictions using true target data as context. + def forecast(self, ts: TSDataset, prediction_size: int) -> TSDataset: + """Make autoregressive forecasts. Parameters ---------- - df: - Features dataframe. + ts: + Dataset with features prediction_size: Number of last timestamps to leave after making prediction. - Previous timestamps will be used as a context for models that require it. + Previous timestamps will be used as a context. Returns ------- : - Array with predictions. 
+ Dataset with predictions Raises ------ ValueError: if context isn't big enough ValueError: - if there are NaNs in a target column on timestamps that are required to make predictions + if forecast context contains NaNs """ - expected_length = prediction_size + self.shift - if len(df) < expected_length: - raise ValueError( - "Given context isn't big enough, try to decrease context_size, prediction_size of increase length of given dataframe!" - ) - - context = df["target"][-expected_length:].values + df = ts.to_pandas() + new_df = self._forecast(df=df, prediction_size=prediction_size) + ts.df = new_df + ts.inverse_transform() + return ts + + def _predict(self, df: pd.DataFrame, prediction_size: int) -> pd.DataFrame: + """Make predictions on a wide dataframe using true values as autoregression context.""" + self._validate_context(df=df, prediction_size=prediction_size) + + expected_length = prediction_size + self.context_size + context = df.loc[:, pd.IndexSlice[:, "target"]].values + context = context[-expected_length:] if np.any(np.isnan(context)): raise ValueError("There are NaNs in a target column, predict method requires target to be filled!") - res = np.zeros(prediction_size) - for res_idx, context_idx in enumerate(range(self.shift, len(context))): - res[res_idx] = context[context_idx - self.shift : context_idx : self.seasonality].mean() - return res - - -class SeasonalMovingAverageModel( - PerSegmentModelMixin, - NonPredictionIntervalContextRequiredModelMixin, - NonPredictionIntervalContextRequiredAbstractModel, -): - """ - Seasonal moving average. - - .. math:: - y_{t} = \\frac{\\sum_{i=1}^{n} y_{t-is} }{n}, - - where :math:`s` is seasonality, :math:`n` is window size (how many history values are taken for forecast). 
- """ + num_segments = context.shape[1] + res = np.zeros((prediction_size, num_segments)) + for res_idx, context_idx in enumerate(range(self.context_size, len(context))): + res[res_idx] = context[context_idx - self.context_size : context_idx : self.seasonality].mean(axis=0) - def __init__(self, window: int = 5, seasonality: int = 7): - """ - Initialize seasonal moving average model. + df = df.iloc[-prediction_size:] + y_pred = res[-prediction_size:] + df.loc[:, pd.IndexSlice[:, "target"]] = y_pred + return df - Length of the context is ``window * seasonality``. + def predict(self, ts: TSDataset, prediction_size: int) -> TSDataset: + """Make predictions using true values as autoregression context (teacher forcing). Parameters ---------- - window: int - Number of values taken for forecast for each point. - seasonality: int - Lag between values taken for forecast. - """ - self.window = window - self.seasonality = seasonality - super(SeasonalMovingAverageModel, self).__init__( - base_model=_SeasonalMovingAverageModel(window=window, seasonality=seasonality) - ) - - @property - def context_size(self) -> int: - """Context size of the model.""" - return self.window * self.seasonality - - def get_model(self) -> Dict[str, "SeasonalMovingAverageModel"]: - """Get internal model. + ts: + Dataset with features + prediction_size: + Number of last timestamps to leave after making prediction. + Previous timestamps will be used as a context. 
Returns ------- : - Internal model + Dataset with predictions + + Raises + ------ + ValueError: + if context isn't big enough + ValueError: + if forecast context contains NaNs """ - return self._get_model() + df = ts.to_pandas() + new_df = self._predict(df=df, prediction_size=prediction_size) + ts.df = new_df + ts.inverse_transform() + return ts __all__ = ["SeasonalMovingAverageModel"] diff --git a/etna/pipeline/autoregressive_pipeline.py b/etna/pipeline/autoregressive_pipeline.py index a2f13772a..4a1a3ff49 100644 --- a/etna/pipeline/autoregressive_pipeline.py +++ b/etna/pipeline/autoregressive_pipeline.py @@ -96,34 +96,28 @@ def fit(self, ts: TSDataset) -> "AutoRegressivePipeline": self.ts.inverse_transform() return self - def _create_predictions_template(self) -> pd.DataFrame: + def _create_predictions_template(self, ts: TSDataset) -> pd.DataFrame: """Create dataframe to fill with forecasts.""" - if self.ts is None: - raise ValueError( - "AutoRegressivePipeline is not fitted! Fit the AutoRegressivePipeline before calling forecast method." 
- ) - prediction_df = self.ts[:, :, "target"] + prediction_df = ts[:, :, "target"] future_dates = pd.date_range( - start=prediction_df.index.max(), periods=self.horizon + 1, freq=self.ts.freq, closed="right" + start=prediction_df.index.max(), periods=self.horizon + 1, freq=ts.freq, closed="right" ) prediction_df = prediction_df.reindex(prediction_df.index.append(future_dates)) prediction_df.index.name = "timestamp" return prediction_df - def _forecast(self) -> TSDataset: + def _forecast(self, ts: TSDataset) -> TSDataset: """Make predictions.""" - if self.ts is None: - raise ValueError("Something went wrong, ts is None!") - prediction_df = self._create_predictions_template() + prediction_df = self._create_predictions_template(ts) for idx_start in range(0, self.horizon, self.step): current_step = min(self.step, self.horizon - idx_start) - current_idx_border = self.ts.index.shape[0] + idx_start + current_idx_border = ts.index.shape[0] + idx_start current_ts = TSDataset( df=prediction_df.iloc[:current_idx_border], - freq=self.ts.freq, - df_exog=self.ts.df_exog, - known_future=self.ts.known_future, + freq=ts.freq, + df_exog=ts.df_exog, + known_future=ts.known_future, ) # manually set transforms in current_ts, otherwise make_future won't know about them current_ts.transforms = self.transforms @@ -152,9 +146,7 @@ def _forecast(self) -> TSDataset: prediction_df = prediction_df.combine_first(current_ts_future.to_pandas()[prediction_df.columns]) # construct dataset and add all features - prediction_ts = TSDataset( - df=prediction_df, freq=self.ts.freq, df_exog=self.ts.df_exog, known_future=self.ts.known_future - ) + prediction_ts = TSDataset(df=prediction_df, freq=ts.freq, df_exog=ts.df_exog, known_future=ts.known_future) prediction_ts.transform(self.transforms) prediction_ts.inverse_transform() # cut only last timestamps from result dataset diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py index 1bc5b7c5f..4e6897384 100644 --- a/etna/pipeline/base.py +++ 
b/etna/pipeline/base.py @@ -1,3 +1,4 @@ +import math from abc import abstractmethod from copy import deepcopy from enum import Enum @@ -15,6 +16,8 @@ from joblib import Parallel from joblib import delayed from scipy.stats import norm +from typing_extensions import TypedDict +from typing_extensions import assert_never from etna.core import AbstractSaveable from etna.core import BaseMixin @@ -33,6 +36,12 @@ class CrossValidationMode(Enum): expand = "expand" constant = "constant" + @classmethod + def _missing_(cls, value): + raise NotImplementedError( + f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} modes allowed" + ) + class FoldMask(BaseMixin): """Container to hold the description of the fold mask. @@ -130,12 +139,20 @@ def fit(self, ts: TSDataset) -> "AbstractPipeline": @abstractmethod def forecast( - self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975), n_folds: int = 3 + self, + ts: Optional[TSDataset] = None, + prediction_interval: bool = False, + quantiles: Sequence[float] = (0.025, 0.975), + n_folds: int = 3, ) -> TSDataset: - """Make predictions. + """Make a forecast of the next points of a dataset. + + The result of forecasting starts from the last point of ``ts``, not including it. Parameters ---------- + ts: + Dataset to forecast. If not given, dataset given during :py:meth:``fit`` is used. prediction_interval: If True returns prediction interval for forecast quantiles: @@ -201,14 +218,18 @@ def backtest( ts: TSDataset, metrics: List[Metric], n_folds: Union[int, List[FoldMask]] = 5, - mode: str = "expand", + mode: Optional[str] = None, aggregate_metrics: bool = False, n_jobs: int = 1, + refit: Union[bool, int] = True, + stride: Optional[int] = None, joblib_params: Optional[Dict[str, Any]] = None, forecast_params: Optional[Dict[str, Any]] = None, ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Run backtest with the pipeline. 
+ If ``refit != True`` and some component of the pipeline doesn't support forecasting with gap, this component will raise an exception. + Parameters ---------- ts: @@ -218,11 +239,23 @@ def backtest( n_folds: Number of folds or the list of fold masks mode: - One of 'expand', 'constant' -- train generation policy + Train generation policy: 'expand' or 'constant'. Works only if ``n_folds`` is integer. + By default, is set to 'expand'. aggregate_metrics: If True aggregate metrics above folds, return raw metrics otherwise n_jobs: Number of jobs to run in parallel + refit: + Determines how often pipeline should be retrained during iteration over folds. + + * If ``True``: pipeline is retrained on each fold. + + * If ``False``: pipeline is trained only on the first fold. + + * If ``value: int``: pipeline is trained every ``value`` folds starting from the first. + + stride: + Number of points between folds. Works only if ``n_folds`` is integer. By default, is set to ``horizon``. joblib_params: Additional parameters for :py:class:`joblib.Parallel` forecast_params: @@ -235,6 +268,15 @@ def backtest( """ +class FoldParallelGroup(TypedDict): + """Group for parallel fold processing.""" + + train_fold_number: int + train_mask: FoldMask + forecast_fold_numbers: List[int] + forecast_masks: List[FoldMask] + + class BasePipeline(AbstractPipeline, BaseMixin): """Base class for pipeline.""" @@ -258,34 +300,29 @@ def _validate_quantiles(quantiles: Sequence[float]) -> Sequence[float]: return quantiles @abstractmethod - def _forecast(self) -> TSDataset: + def _forecast(self, ts: TSDataset) -> TSDataset: """Make predictions.""" pass def _forecast_prediction_interval( - self, predictions: TSDataset, quantiles: Sequence[float], n_folds: int + self, ts: TSDataset, predictions: TSDataset, quantiles: Sequence[float], n_folds: int ) -> TSDataset: """Add prediction intervals to the forecasts.""" - if self.ts is None: - raise ValueError("Pipeline is not fitted! 
Fit the Pipeline before calling forecast method.") with tslogger.disable(): - _, forecasts, _ = self.backtest(ts=self.ts, metrics=[MAE()], n_folds=n_folds) + _, forecasts, _ = self.backtest(ts=ts, metrics=[MAE()], n_folds=n_folds) - self._add_forecast_borders(backtest_forecasts=forecasts, quantiles=quantiles, predictions=predictions) + self._add_forecast_borders(ts=ts, backtest_forecasts=forecasts, quantiles=quantiles, predictions=predictions) return predictions def _add_forecast_borders( - self, backtest_forecasts: pd.DataFrame, quantiles: Sequence[float], predictions: TSDataset + self, ts: TSDataset, backtest_forecasts: pd.DataFrame, quantiles: Sequence[float], predictions: TSDataset ) -> None: """Estimate prediction intervals and add to the forecasts.""" - if self.ts is None: - raise ValueError("Pipeline is not fitted!") - - backtest_forecasts = TSDataset(df=backtest_forecasts, freq=self.ts.freq) + backtest_forecasts = TSDataset(df=backtest_forecasts, freq=ts.freq) residuals = ( backtest_forecasts.loc[:, pd.IndexSlice[:, "target"]] - - self.ts[backtest_forecasts.index.min() : backtest_forecasts.index.max(), :, "target"] + - ts[backtest_forecasts.index.min() : backtest_forecasts.index.max(), :, "target"] ) sigma = np.std(residuals.values, axis=0) @@ -299,12 +336,20 @@ def _add_forecast_borders( predictions.df = pd.concat([predictions.df] + borders, axis=1).sort_index(axis=1, level=(0, 1)) def forecast( - self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975), n_folds: int = 3 + self, + ts: Optional[TSDataset] = None, + prediction_interval: bool = False, + quantiles: Sequence[float] = (0.025, 0.975), + n_folds: int = 3, ) -> TSDataset: - """Make predictions. + """Make a forecast of the next points of a dataset. + + The result of forecasting starts from the last point of ``ts``, not including it. Parameters ---------- + ts: + Dataset to forecast. If not given, dataset given during :py:meth:``fit`` is used. 
prediction_interval: If True returns prediction interval for forecast quantiles: @@ -317,18 +362,20 @@ def forecast( : Dataset with predictions """ - if self.ts is None: - raise ValueError( - f"{self.__class__.__name__} is not fitted! Fit the {self.__class__.__name__} " - f"before calling forecast method." - ) + if ts is None: + if self.ts is None: + raise ValueError( + "There is no ts to forecast! Pass ts into forecast method or make sure that pipeline is loaded with ts." + ) + ts = self.ts + self._validate_quantiles(quantiles=quantiles) self._validate_backtest_n_folds(n_folds=n_folds) - predictions = self._forecast() + predictions = self._forecast(ts=ts) if prediction_interval: predictions = self._forecast_prediction_interval( - predictions=predictions, quantiles=quantiles, n_folds=n_folds + ts=ts, predictions=predictions, quantiles=quantiles, n_folds=n_folds ) return predictions @@ -428,43 +475,65 @@ def _init_backtest(self): @staticmethod def _validate_backtest_n_folds(n_folds: int): - """Check that given n_folds value is valid.""" + """Check that given n_folds value is >= 1.""" if n_folds < 1: raise ValueError(f"Folds number should be a positive number, {n_folds} given") @staticmethod - def _validate_backtest_dataset(ts: TSDataset, n_folds: int, horizon: int): + def _validate_backtest_mode(n_folds: Union[int, List[FoldMask]], mode: Optional[str]) -> CrossValidationMode: + if mode is None: + return CrossValidationMode.expand + + if not isinstance(n_folds, int): + raise ValueError("Mode shouldn't be set if n_folds are fold masks!") + + return CrossValidationMode(mode.lower()) + + @staticmethod + def _validate_backtest_stride(n_folds: Union[int, List[FoldMask]], horizon: int, stride: Optional[int]) -> int: + if stride is None: + return horizon + + if not isinstance(n_folds, int): + raise ValueError("Stride shouldn't be set if n_folds are fold masks!") + + if stride < 1: + raise ValueError(f"Stride should be a positive number, {stride} given!") + + return stride + 
+ @staticmethod + def _validate_backtest_dataset(ts: TSDataset, n_folds: int, horizon: int, stride: int): """Check all segments have enough timestamps to validate forecaster with given number of splits.""" - min_required_length = horizon * n_folds + min_required_length = horizon + (n_folds - 1) * stride segments = set(ts.df.columns.get_level_values("segment")) for segment in segments: segment_target = ts[:, segment, "target"] if len(segment_target) < min_required_length: raise ValueError( f"All the series from feature dataframe should contain at least " - f"{horizon} * {n_folds} = {min_required_length} timestamps; " + f"{horizon} + {n_folds-1} * {stride} = {min_required_length} timestamps; " f"series {segment} does not." ) @staticmethod - def _generate_masks_from_n_folds(ts: TSDataset, n_folds: int, horizon: int, mode: str) -> List[FoldMask]: + def _generate_masks_from_n_folds( + ts: TSDataset, n_folds: int, horizon: int, mode: CrossValidationMode, stride: int + ) -> List[FoldMask]: """Generate fold masks from n_folds.""" - mode_enum = CrossValidationMode(mode.lower()) - if mode_enum == CrossValidationMode.expand: + if mode is CrossValidationMode.expand: constant_history_length = 0 - elif mode_enum == CrossValidationMode.constant: + elif mode is CrossValidationMode.constant: constant_history_length = 1 else: - raise NotImplementedError( - f"Only '{CrossValidationMode.expand}' and '{CrossValidationMode.constant}' modes allowed" - ) + assert_never(mode) masks = [] dataset_timestamps = list(ts.index) min_timestamp_idx, max_timestamp_idx = 0, len(dataset_timestamps) for offset in range(n_folds, 0, -1): - min_train_idx = min_timestamp_idx + (n_folds - offset) * horizon * constant_history_length - max_train_idx = max_timestamp_idx - horizon * offset - 1 + min_train_idx = min_timestamp_idx + (n_folds - offset) * stride * constant_history_length + max_train_idx = max_timestamp_idx - stride * (offset - 1) - horizon - 1 min_test_idx = max_train_idx + 1 max_test_idx = 
max_train_idx + horizon @@ -521,21 +590,40 @@ def _compute_metrics( metrics_values[metric.name] = metric(y_true=y_true, y_pred=y_pred) # type: ignore return metrics_values - def _run_fold( + def _fit_backtest_pipeline( + self, + ts: TSDataset, + fold_number: int, + ) -> "BasePipeline": + """Fit pipeline for a given data in backtest.""" + tslogger.start_experiment(job_type="training", group=str(fold_number)) + pipeline = deepcopy(self) + pipeline.fit(ts=ts) + tslogger.finish_experiment() + return pipeline + + def _forecast_backtest_pipeline( + self, pipeline: "BasePipeline", ts: TSDataset, fold_number: int, forecast_params: Dict[str, Any] + ) -> TSDataset: + """Make a forecast with a given pipeline in backtest.""" + tslogger.start_experiment(job_type="forecasting", group=str(fold_number)) + forecast = pipeline.forecast(ts=ts, **forecast_params) + tslogger.finish_experiment() + return forecast + + def _process_fold_forecast( self, + forecast: TSDataset, train: TSDataset, test: TSDataset, + pipeline: "BasePipeline", fold_number: int, mask: FoldMask, metrics: List[Metric], - forecast_params: Dict[str, Any], ) -> Dict[str, Any]: - """Run fit-forecast pipeline of model for one fold.""" + """Process forecast made for a fold.""" tslogger.start_experiment(job_type="crossval", group=str(fold_number)) - pipeline = deepcopy(self) - pipeline.fit(ts=train) - forecast = pipeline.forecast(**forecast_params) fold: Dict[str, Any] = {} for stage_name, stage_df in zip(("train", "test"), (train, test)): fold[f"{stage_name}_timerange"] = {} @@ -583,7 +671,7 @@ def _get_fold_info(self) -> pd.DataFrame: tmp_df[f"{stage_name}_{border}_time"] = [fold_info[f"{stage_name}_timerange"][border]] tmp_df[self._fold_column] = fold_number timerange_dfs.append(tmp_df) - timerange_df = pd.concat(timerange_dfs) + timerange_df = pd.concat(timerange_dfs, ignore_index=True) return timerange_df def _get_backtest_forecasts(self) -> pd.DataFrame: @@ -606,12 +694,16 @@ def _get_backtest_forecasts(self) -> 
pd.DataFrame: forecasts.sort_index(axis=1, inplace=True) return forecasts - def _prepare_fold_masks(self, ts: TSDataset, masks: Union[int, List[FoldMask]], mode: str) -> List[FoldMask]: + def _prepare_fold_masks( + self, ts: TSDataset, masks: Union[int, List[FoldMask]], mode: CrossValidationMode, stride: int + ) -> List[FoldMask]: """Prepare and validate fold masks.""" if isinstance(masks, int): self._validate_backtest_n_folds(n_folds=masks) - self._validate_backtest_dataset(ts=ts, n_folds=masks, horizon=self.horizon) - masks = self._generate_masks_from_n_folds(ts=ts, n_folds=masks, horizon=self.horizon, mode=mode) + self._validate_backtest_dataset(ts=ts, n_folds=masks, horizon=self.horizon, stride=stride) + masks = self._generate_masks_from_n_folds( + ts=ts, n_folds=masks, horizon=self.horizon, mode=mode, stride=stride + ) for i, mask in enumerate(masks): mask.first_train_timestamp = mask.first_train_timestamp if mask.first_train_timestamp else ts.index[0] masks[i] = mask @@ -619,19 +711,126 @@ def _prepare_fold_masks(self, ts: TSDataset, masks: Union[int, List[FoldMask]], mask.validate_on_dataset(ts=ts, horizon=self.horizon) return masks + @staticmethod + def _make_backtest_fold_groups(masks: List[FoldMask], refit: Union[bool, int]) -> List[FoldParallelGroup]: + """Make groups of folds for backtest.""" + if not refit: + refit = len(masks) + + grouped_folds = [] + num_groups = math.ceil(len(masks) / refit) + for group_id in range(num_groups): + train_fold_number = group_id * refit + forecast_fold_numbers = [train_fold_number + i for i in range(refit) if train_fold_number + i < len(masks)] + cur_group: FoldParallelGroup = { + "train_fold_number": train_fold_number, + "train_mask": masks[train_fold_number], + "forecast_fold_numbers": forecast_fold_numbers, + "forecast_masks": [masks[i] for i in forecast_fold_numbers], + } + grouped_folds.append(cur_group) + + return grouped_folds + + def _run_all_folds( + self, + masks: List[FoldMask], + ts: TSDataset, + metrics: 
List[Metric], + n_jobs: int, + refit: Union[bool, int], + joblib_params: Dict[str, Any], + forecast_params: Dict[str, Any], + ) -> Dict[int, Any]: + """Run pipeline on all folds.""" + fold_groups = self._make_backtest_fold_groups(masks=masks, refit=refit) + + with Parallel(n_jobs=n_jobs, **joblib_params) as parallel: + # fitting + fit_masks = [group["train_mask"] for group in fold_groups] + fit_datasets = ( + train for train, _ in self._generate_folds_datasets(ts=ts, masks=fit_masks, horizon=self.horizon) + ) + pipelines = parallel( + delayed(self._fit_backtest_pipeline)(ts=fit_ts, fold_number=fold_groups[group_idx]["train_fold_number"]) + for group_idx, fit_ts in enumerate(fit_datasets) + ) + + # forecasting + forecast_masks = [group["forecast_masks"] for group in fold_groups] + forecast_datasets = ( + ( + train + for train, _ in self._generate_folds_datasets( + ts=ts, masks=group_forecast_masks, horizon=self.horizon + ) + ) + for group_forecast_masks in forecast_masks + ) + forecasts_flat = parallel( + delayed(self._forecast_backtest_pipeline)( + ts=forecast_ts, + pipeline=pipelines[group_idx], + fold_number=fold_groups[group_idx]["forecast_fold_numbers"][idx], + forecast_params=forecast_params, + ) + for group_idx, group_forecast_datasets in enumerate(forecast_datasets) + for idx, forecast_ts in enumerate(group_forecast_datasets) + ) + + # processing forecasts + fold_process_train_datasets = ( + train for train, _ in self._generate_folds_datasets(ts=ts, masks=fit_masks, horizon=self.horizon) + ) + fold_process_test_datasets = ( + ( + test + for _, test in self._generate_folds_datasets( + ts=ts, masks=group_forecast_masks, horizon=self.horizon + ) + ) + for group_forecast_masks in forecast_masks + ) + fold_results_flat = parallel( + delayed(self._process_fold_forecast)( + forecast=forecasts_flat[group_idx * refit + idx], + train=train, + test=test, + pipeline=pipelines[group_idx], + fold_number=fold_groups[group_idx]["forecast_fold_numbers"][idx], + 
mask=fold_groups[group_idx]["forecast_masks"][idx], + metrics=metrics, + ) + for group_idx, (train, group_fold_process_test_datasets) in enumerate( + zip(fold_process_train_datasets, fold_process_test_datasets) + ) + for idx, test in enumerate(group_fold_process_test_datasets) + ) + + results = { + fold_number: fold_results_flat[group_idx * refit + idx] + for group_idx in range(len(fold_groups)) + for idx, fold_number in enumerate(fold_groups[group_idx]["forecast_fold_numbers"]) + } + return results + def backtest( self, ts: TSDataset, metrics: List[Metric], n_folds: Union[int, List[FoldMask]] = 5, - mode: str = "expand", + mode: Optional[str] = None, aggregate_metrics: bool = False, n_jobs: int = 1, + refit: Union[bool, int] = True, + stride: Optional[int] = None, joblib_params: Optional[Dict[str, Any]] = None, forecast_params: Optional[Dict[str, Any]] = None, ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Run backtest with the pipeline. + If ``refit != True`` and some component of the pipeline doesn't support forecasting with gap, this component will raise an exception. + Parameters ---------- ts: @@ -641,11 +840,23 @@ def backtest( n_folds: Number of folds or the list of fold masks mode: - One of 'expand', 'constant' -- train generation policy, ignored if n_folds is a list of masks + Train generation policy: 'expand' or 'constant'. Works only if ``n_folds`` is integer. + By default, is set to 'expand'. aggregate_metrics: If True aggregate metrics above folds, return raw metrics otherwise n_jobs: Number of jobs to run in parallel + refit: + Determines how often pipeline should be retrained during iteration over folds. + + * If ``True``: pipeline is retrained on each fold. + + * If ``False``: pipeline is trained only on the first fold. + + * If ``value: int``: pipeline is trained every ``value`` folds starting from the first. + + stride: + Number of points between folds. Works only if ``n_folds`` is integer. By default, is set to ``horizon``. 
joblib_params: Additional parameters for :py:class:`joblib.Parallel` forecast_params: @@ -655,7 +866,17 @@ def backtest( ------- metrics_df, forecast_df, fold_info_df: Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] Metrics dataframe, forecast dataframe and dataframe with information about folds + + Raises + ------ + ValueError: + If ``mode`` is set when ``n_folds`` are ``List[FoldMask]``. + ValueError: + If ``stride`` is set when ``n_folds`` are ``List[FoldMask]``. """ + mode_enum = self._validate_backtest_mode(n_folds=n_folds, mode=mode) + stride = self._validate_backtest_stride(n_folds=n_folds, horizon=self.horizon, stride=stride) + if joblib_params is None: joblib_params = dict(verbose=11, backend="multiprocessing", mmap_mode="c") @@ -664,22 +885,16 @@ def backtest( self._init_backtest() self._validate_backtest_metrics(metrics=metrics) - masks = self._prepare_fold_masks(ts=ts, masks=n_folds, mode=mode) - - folds = Parallel(n_jobs=n_jobs, **joblib_params)( - delayed(self._run_fold)( - train=train, - test=test, - fold_number=fold_number, - mask=masks[fold_number], - metrics=metrics, - forecast_params=forecast_params, - ) - for fold_number, (train, test) in enumerate( - self._generate_folds_datasets(ts=ts, masks=masks, horizon=self.horizon) - ) + masks = self._prepare_fold_masks(ts=ts, masks=n_folds, mode=mode_enum, stride=stride) + self._folds = self._run_all_folds( + masks=masks, + ts=ts, + metrics=metrics, + n_jobs=n_jobs, + refit=refit, + joblib_params=joblib_params, + forecast_params=forecast_params, ) - self._folds = {i: fold for i, fold in enumerate(folds)} metrics_df = self._get_backtest_metrics(aggregate_metrics=aggregate_metrics) forecast_df = self._get_backtest_forecasts() diff --git a/etna/pipeline/hierarchical_pipeline.py b/etna/pipeline/hierarchical_pipeline.py index c7ca590ed..85066e42c 100644 --- a/etna/pipeline/hierarchical_pipeline.py +++ b/etna/pipeline/hierarchical_pipeline.py @@ -1,3 +1,4 @@ +import pathlib from copy import deepcopy from 
typing import Dict from typing import List @@ -67,12 +68,20 @@ def fit(self, ts: TSDataset) -> "HierarchicalPipeline": return self def raw_forecast( - self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.25, 0.75), n_folds: int = 3 + self, + ts: TSDataset, + prediction_interval: bool = False, + quantiles: Sequence[float] = (0.25, 0.75), + n_folds: int = 3, ) -> TSDataset: - """Make a prediction for target at the source level of hierarchy. + """Make a forecast of the next points of a dataset on a source level. + + The result of forecasting starts from the last point of ``ts``, not including it. Parameters ---------- + ts: + Dataset to forecast prediction_interval: If True returns prediction interval for forecast quantiles: @@ -85,25 +94,41 @@ def raw_forecast( : Dataset with predictions at the source level """ - forecast = super().forecast(prediction_interval=prediction_interval, quantiles=quantiles, n_folds=n_folds) - target_columns = tuple(get_target_with_quantiles(columns=forecast.columns)) + # handle `prediction_interval=True` separately + source_ts = self.reconciliator.aggregate(ts=ts) + forecast = super().forecast(ts=source_ts, prediction_interval=False, n_folds=n_folds) + if prediction_interval: + forecast = self._forecast_prediction_interval( + ts=ts, predictions=forecast, quantiles=quantiles, n_folds=n_folds + ) + target_columns = tuple(get_target_with_quantiles(columns=forecast.columns)) hierarchical_forecast = TSDataset( df=forecast[..., target_columns], freq=forecast.freq, df_exog=forecast.df_exog, known_future=forecast.known_future, - hierarchical_structure=self.ts.hierarchical_structure, # type: ignore + hierarchical_structure=ts.hierarchical_structure, # type: ignore ) return hierarchical_forecast def forecast( - self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975), n_folds: int = 3 + self, + ts: Optional[TSDataset] = None, + prediction_interval: bool = False, + quantiles: Sequence[float] = (0.025, 
0.975), + n_folds: int = 3, ) -> TSDataset: - """Make a prediction for target at the source level of hierarchy and make reconciliation to target level. + """Make a forecast of the next points of a dataset on a target level. + + The result of forecasting starts from the last point of ``ts``, not including it. + + Method makes a prediction for target at the source level of hierarchy and then makes reconciliation to target level. Parameters ---------- + ts: + Dataset to forecast. If not given, dataset given during :py:meth:``fit`` is used. prediction_interval: If True returns prediction interval for forecast quantiles: @@ -116,7 +141,16 @@ def forecast( : Dataset with predictions at the target level of hierarchy. """ - forecast = self.raw_forecast(prediction_interval=prediction_interval, quantiles=quantiles, n_folds=n_folds) + if ts is None: + if self._fit_ts is None: + raise ValueError( + "There is no ts to forecast! Pass ts into forecast method or make sure that pipeline is loaded with ts." + ) + ts = self._fit_ts + + forecast = self.raw_forecast( + ts=ts, prediction_interval=prediction_interval, quantiles=quantiles, n_folds=n_folds + ) forecast_reconciled = self.reconciliator.reconcile(forecast) return forecast_reconciled @@ -136,21 +170,68 @@ def _compute_metrics( return metrics_values def _forecast_prediction_interval( - self, predictions: TSDataset, quantiles: Sequence[float], n_folds: int + self, ts: TSDataset, predictions: TSDataset, quantiles: Sequence[float], n_folds: int ) -> TSDataset: """Add prediction intervals to the forecasts.""" + # TODO: fix this: what if during backtest KeyboardInterrupt is raised self.forecast, self.raw_forecast = self.raw_forecast, self.forecast # type: ignore - if self.ts is None or self._fit_ts is None: + if self.ts is None: raise ValueError("Pipeline is not fitted! 
Fit the Pipeline before calling forecast method.") # TODO: rework intervals estimation for `BottomUpReconciliator` with tslogger.disable(): - _, forecasts, _ = self.backtest(ts=self._fit_ts, metrics=[MAE()], n_folds=n_folds) + _, forecasts, _ = self.backtest(ts=ts, metrics=[MAE()], n_folds=n_folds) - self._add_forecast_borders(backtest_forecasts=forecasts, quantiles=quantiles, predictions=predictions) + source_ts = self.reconciliator.aggregate(ts=ts) + self._add_forecast_borders( + ts=source_ts, backtest_forecasts=forecasts, quantiles=quantiles, predictions=predictions + ) self.forecast, self.raw_forecast = self.raw_forecast, self.forecast # type: ignore return predictions + + def save(self, path: pathlib.Path): + """Save the object. + + Parameters + ---------- + path: + Path to save object to. + """ + fit_ts = self._fit_ts + + try: + # extract attributes we can't easily save + delattr(self, "_fit_ts") + + # save the remaining part + super().save(path=path) + finally: + self._fit_ts = fit_ts + + @classmethod + def load(cls, path: pathlib.Path, ts: Optional[TSDataset] = None) -> "HierarchicalPipeline": + """Load an object. + + Parameters + ---------- + path: + Path to load object from. + ts: + TSDataset to set into loaded pipeline. + + Returns + ------- + : + Loaded object. 
+ """ + obj = super().load(path=path) + obj._fit_ts = deepcopy(ts) + if ts is not None: + obj.ts = obj.reconciliator.aggregate(ts=ts) + else: + obj.ts = None + return obj diff --git a/etna/pipeline/pipeline.py b/etna/pipeline/pipeline.py index 456925bd5..09080360c 100644 --- a/etna/pipeline/pipeline.py +++ b/etna/pipeline/pipeline.py @@ -1,3 +1,4 @@ +from typing import Optional from typing import Sequence from typing import cast @@ -56,28 +57,36 @@ def fit(self, ts: TSDataset) -> "Pipeline": self.ts.inverse_transform() return self - def _forecast(self) -> TSDataset: + def _forecast(self, ts: TSDataset) -> TSDataset: """Make predictions.""" - if self.ts is None: - raise ValueError("Something went wrong, ts is None!") + # because make_future uses `ts.transforms` + ts.transforms = self.transforms if isinstance(self.model, get_args(ContextRequiredModelType)): self.model = cast(ContextRequiredModelType, self.model) - future = self.ts.make_future(future_steps=self.horizon, tail_steps=self.model.context_size) + future = ts.make_future(future_steps=self.horizon, tail_steps=self.model.context_size) predictions = self.model.forecast(ts=future, prediction_size=self.horizon) else: self.model = cast(ContextIgnorantModelType, self.model) - future = self.ts.make_future(future_steps=self.horizon) + future = ts.make_future(future_steps=self.horizon) predictions = self.model.forecast(ts=future) return predictions def forecast( - self, prediction_interval: bool = False, quantiles: Sequence[float] = (0.025, 0.975), n_folds: int = 3 + self, + ts: Optional[TSDataset] = None, + prediction_interval: bool = False, + quantiles: Sequence[float] = (0.025, 0.975), + n_folds: int = 3, ) -> TSDataset: - """Make predictions. + """Make a forecast of the next points of a dataset. + + The result of forecasting starts from the last point of ``ts``, not including it. Parameters ---------- + ts: + Dataset to forecast. If not given, dataset given during :py:meth:``fit`` is used. 
prediction_interval: If True returns prediction interval for forecast quantiles: @@ -90,24 +99,29 @@ def forecast( : Dataset with predictions """ - if self.ts is None: - raise ValueError( - f"{self.__class__.__name__} is not fitted! Fit the {self.__class__.__name__} " - f"before calling forecast method." - ) + if ts is None: + if self.ts is None: + raise ValueError( + "There is no ts to forecast! Pass ts into forecast method or make sure that pipeline is loaded with ts." + ) + ts = self.ts + else: + # because make_future uses `ts.transforms` + ts.transforms = self.transforms + self._validate_quantiles(quantiles=quantiles) self._validate_backtest_n_folds(n_folds=n_folds) if prediction_interval and isinstance(self.model, PredictionIntervalContextIgnorantAbstractModel): - future = self.ts.make_future(future_steps=self.horizon) + future = ts.make_future(future_steps=self.horizon) predictions = self.model.forecast(ts=future, prediction_interval=prediction_interval, quantiles=quantiles) elif prediction_interval and isinstance(self.model, PredictionIntervalContextRequiredAbstractModel): - future = self.ts.make_future(future_steps=self.horizon, tail_steps=self.model.context_size) + future = ts.make_future(future_steps=self.horizon, tail_steps=self.model.context_size) predictions = self.model.forecast( ts=future, prediction_size=self.horizon, prediction_interval=prediction_interval, quantiles=quantiles ) else: predictions = super().forecast( - prediction_interval=prediction_interval, quantiles=quantiles, n_folds=n_folds + ts=ts, prediction_interval=prediction_interval, quantiles=quantiles, n_folds=n_folds ) return predictions diff --git a/etna/transforms/base.py b/etna/transforms/base.py index d71070fe6..1fa72895c 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -94,14 +94,19 @@ def fit(self, df: pd.DataFrame) -> "PerSegmentWrapper": def transform(self, df: pd.DataFrame) -> pd.DataFrame: """Apply transform to each segment separately.""" results = [] - 
for key, value in self.segment_transforms.items(): - seg_df = value.transform(df[key]) + segments = set(df.columns.get_level_values("segment")) + for segment in segments: + if segment not in self.segment_transforms: + raise NotImplementedError("Per-segment transforms can't work on new segments!") + + segment_transform = self.segment_transforms[segment] + seg_df = segment_transform.transform(df[segment]) _idx = seg_df.columns.to_frame() - _idx.insert(0, "segment", key) + _idx.insert(0, "segment", segment) seg_df.columns = pd.MultiIndex.from_frame(_idx) - results.append(seg_df) + df = pd.concat(results, axis=1) df = df.sort_index(axis=1) df.columns.names = ["segment", "feature"] @@ -110,11 +115,16 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: """Apply inverse_transform to each segment.""" results = [] - for key, value in self.segment_transforms.items(): - seg_df = value.inverse_transform(df[key]) + segments = set(df.columns.get_level_values("segment")) + for segment in segments: + if segment not in self.segment_transforms: + raise NotImplementedError("Per-segment transforms can't work on new segments!") + + segment_transform = self.segment_transforms[segment] + seg_df = segment_transform.inverse_transform(df[segment]) _idx = seg_df.columns.to_frame() - _idx.insert(0, "segment", key) + _idx.insert(0, "segment", segment) seg_df.columns = pd.MultiIndex.from_frame(_idx) results.append(seg_df) diff --git a/etna/transforms/encoders/mean_segment_encoder.py b/etna/transforms/encoders/mean_segment_encoder.py index 8f518441c..60fab1462 100644 --- a/etna/transforms/encoders/mean_segment_encoder.py +++ b/etna/transforms/encoders/mean_segment_encoder.py @@ -1,4 +1,7 @@ -import numpy as np +import reprlib +from typing import Dict +from typing import Optional + import pandas as pd from etna.transforms import Transform @@ -13,7 +16,7 @@ class MeanSegmentEncoderTransform(Transform, FutureMixin): def 
__init__(self): self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean") - self.global_means: np.ndarray[float] = None + self.global_means: Optional[Dict[str, float]] = None def fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform": """ @@ -30,7 +33,9 @@ def fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform": Fitted transform """ self.mean_encoder.fit(df) - self.global_means = df.loc[:, self.idx[:, "target"]].mean().values + mean_values = df.loc[:, self.idx[:, "target"]].mean().to_dict() + mean_values = {key[0]: value for key, value in mean_values.items()} + self.global_means = mean_values return self def transform(self, df: pd.DataFrame) -> pd.DataFrame: @@ -46,9 +51,26 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- : result dataframe + + Raises + ------ + ValueError: + If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. """ + if self.global_means is None: + raise ValueError("The transform isn't fitted!") + + segments = df.columns.get_level_values("segment").unique().tolist() + new_segments = set(segments) - self.global_means.keys() + if len(new_segments) > 0: + raise NotImplementedError( + f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}" + ) + df = self.mean_encoder.transform(df) - segment = df.columns.get_level_values("segment").unique()[0] + segment = segments[0] nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index - df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = self.global_means + df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = [self.global_means[x] for x in segments] return df diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py index e899b8eac..f54a7a13d 100644 --- a/etna/transforms/encoders/segment_encoder.py +++ b/etna/transforms/encoders/segment_encoder.py @@ -1,3 +1,6 @@ +import reprlib 
+ +import numpy as np import pandas as pd from sklearn import preprocessing @@ -44,12 +47,31 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- : result dataframe + + Raises + ------ + ValueError: + If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. """ - encoded_matrix = self._le.transform(self._le.classes_) - encoded_matrix = encoded_matrix.reshape(len(self._le.classes_), -1).repeat(len(df), axis=1).T + segments = df.columns.get_level_values("segment").unique().tolist() + + try: + new_segments = set(segments) - set(self._le.classes_) + except AttributeError: + raise ValueError("The transform isn't fitted!") + + if len(new_segments) > 0: + raise NotImplementedError( + f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}" + ) + + encoded_matrix = self._le.transform(segments) + encoded_matrix = np.tile(encoded_matrix, (len(df), 1)) encoded_df = pd.DataFrame( encoded_matrix, - columns=pd.MultiIndex.from_product([self._le.classes_, ["segment_code"]], names=("segment", "feature")), + columns=pd.MultiIndex.from_product([segments, ["segment_code"]], names=("segment", "feature")), index=df.index, ) encoded_df = encoded_df.astype("category") diff --git a/etna/transforms/math/differencing.py b/etna/transforms/math/differencing.py index 74f0a9e10..692066429 100644 --- a/etna/transforms/math/differencing.py +++ b/etna/transforms/math/differencing.py @@ -3,18 +3,21 @@ from typing import Optional from typing import Set from typing import Union +from typing import cast import numpy as np import pandas as pd from etna.transforms.base import Transform +from etna.transforms.utils import check_new_segments from etna.transforms.utils import match_target_quantiles class _SingleDifferencingTransform(Transform): """Calculate a time series differences of order 1. 
- This transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``fit`` this transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``transform`` and ``inverse_transform`` there is no special treatment of NaNs. Notes ----- @@ -84,12 +87,16 @@ def fit(self, df: pd.DataFrame) -> "_SingleDifferencingTransform": Returns ------- result: _SingleDifferencingTransform + + Raises + ------ + ValueError: + if NaNs are present inside the segment """ segments = sorted(set(df.columns.get_level_values("segment"))) fit_df = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy() - self._train_timestamp = fit_df.index - self._train_init_dict = {} + train_init_dict = {} for current_segment in segments: cur_series = fit_df.loc[:, pd.IndexSlice[current_segment, self.in_column]] cur_series = cur_series.loc[cur_series.first_valid_index() :] @@ -97,7 +104,10 @@ def fit(self, df: pd.DataFrame) -> "_SingleDifferencingTransform": if cur_series.isna().sum() > 0: raise ValueError(f"There should be no NaNs inside the segments") - self._train_init_dict[current_segment] = cur_series[: self.period] + train_init_dict[current_segment] = cur_series[: self.period] + + self._train_init_dict = train_init_dict + self._train_timestamp = fit_df.index self._test_init_df = fit_df.iloc[-self.period :, :] # make multiindex levels consistent self._test_init_df.columns = self._test_init_df.columns.remove_unused_levels() @@ -113,21 +123,15 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: pd.Dataframe + result: transformed dataframe """ - if self._train_init_dict is None or self._test_init_df is None or self._train_timestamp is None: - raise AttributeError("Transform is not fitted") - segments = sorted(set(df.columns.get_level_values("segment"))) transformed = df.loc[:, pd.IndexSlice[segments, self.in_column]].copy() for current_segment in segments: 
to_transform = transformed.loc[:, pd.IndexSlice[current_segment, self.in_column]] - start_idx = to_transform.first_valid_index() # make a differentiation - transformed.loc[start_idx:, pd.IndexSlice[current_segment, self.in_column]] = to_transform.loc[ - start_idx: - ].diff(periods=self.period) + transformed.loc[:, pd.IndexSlice[current_segment, self.in_column]] = to_transform.diff(periods=self.period) if self.inplace: result_df = df.copy() @@ -184,12 +188,9 @@ def _reconstruct_test(self, df: pd.DataFrame, columns_to_inverse: Set[str]) -> p to_transform = df.loc[:, pd.IndexSlice[segments, column]].copy() init_df = self._test_init_df.copy() # type: ignore init_df.columns.set_levels([column], level="feature", inplace=True) + init_df = init_df[segments] to_transform = pd.concat([init_df, to_transform]) - # validate values inside the series to transform - if to_transform.isna().sum().sum() > 0: - raise ValueError(f"There should be no NaNs inside the segments") - # run reconstruction and save the result to_transform = self._make_inv_diff(to_transform) result_df.loc[:, pd.IndexSlice[segments, column]] = to_transform @@ -206,11 +207,18 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: pd.DataFrame + result: transformed DataFrame. + + Raises + ------ + ValueError: + if inverse transform is applied not to full train nor to test that goes after train + ValueError: + if inverse transform is applied to test that goes after train with gap """ - if self._train_init_dict is None or self._test_init_df is None or self._train_timestamp is None: - raise AttributeError("Transform is not fitted") + # we assume this to be fitted + self._train_timestamp = cast(pd.DatetimeIndex, self._train_timestamp) if not self.inplace: return df @@ -239,7 +247,8 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: class DifferencingTransform(Transform): """Calculate a time series differences. 
- This transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``fit`` this transform can work with NaNs at the beginning of the segment, but fails when meets NaN inside the segment. + During ``transform`` and ``inverse_transform`` there is no special treatment of NaNs. Notes ----- @@ -311,6 +320,7 @@ def __init__( self._differencing_transforms.append( _SingleDifferencingTransform(in_column=result_out_column, period=self.period, inplace=True) ) + self._fit_segments: Optional[List[str]] = None def _get_column_name(self) -> str: if self.inplace: @@ -331,13 +341,23 @@ def fit(self, df: pd.DataFrame) -> "DifferencingTransform": Returns ------- result: DifferencingTransform + + Raises + ------ + ValueError: + if NaNs are present inside the segment """ # this is made because transforms of high order may need some columns created by transforms of lower order result_df = df.copy() for transform in self._differencing_transforms: result_df = transform.fit_transform(result_df) + self._fit_segments = df.columns.get_level_values("segment").unique().tolist() return self + def _check_is_fitted(self): + if self._fit_segments is None: + raise ValueError("Transform is not fitted!") + def transform(self, df: pd.DataFrame) -> pd.DataFrame: """Make a differencing transformation. 
@@ -348,9 +368,21 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: pd.Dataframe + result: transformed dataframe + + Raises + ------ + ValueError: + if transform isn't fitted + NotImplementedError: + if there are segments that weren't present during training """ + self._check_is_fitted() + segments = df.columns.get_level_values("segment").unique().tolist() + if self.inplace: + check_new_segments(transform_segments=segments, fit_segments=self._fit_segments) + result_df = df.copy() for transform in self._differencing_transforms: result_df = transform.transform(result_df) @@ -366,12 +398,27 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: pd.DataFrame + result: transformed DataFrame. + + Raises + ------ + ValueError: + if transform isn't fitted + NotImplementedError: + if there are segments that weren't present during training + ValueError: + if inverse transform is applied not to full train nor to test that goes after train + ValueError: + if inverse transform is applied to test that goes after train with gap """ + self._check_is_fitted() if not self.inplace: return df + segments = df.columns.get_level_values("segment").unique().tolist() + check_new_segments(transform_segments=segments, fit_segments=self._fit_segments) + result_df = df.copy() for transform in self._differencing_transforms[::-1]: result_df = transform.inverse_transform(result_df) diff --git a/etna/transforms/math/sklearn.py b/etna/transforms/math/sklearn.py index bd32db9d6..c4c56aee4 100644 --- a/etna/transforms/math/sklearn.py +++ b/etna/transforms/math/sklearn.py @@ -4,6 +4,7 @@ from typing import List from typing import Optional from typing import Union +from typing import cast import numpy as np import pandas as pd @@ -12,6 +13,7 @@ from etna.core import StringEnumWithRepr from etna.datasets import set_columns_wide from etna.transforms.base import Transform +from etna.transforms.utils import check_new_segments from 
etna.transforms.utils import match_target_quantiles @@ -72,6 +74,7 @@ def __init__( self.out_column = out_column self.out_columns: Optional[List[str]] = None + self._fit_segments: Optional[List[str]] = None def _get_column_name(self, in_column: str) -> str: if self.out_column is None: @@ -104,10 +107,11 @@ def fit(self, df: pd.DataFrame) -> "SklearnTransform": else: self.out_columns = [self._get_column_name(column) for column in self.in_column] + self._fit_segments = df.columns.get_level_values("segment").unique().tolist() if self.mode == TransformMode.per_segment: x = df.loc[:, pd.IndexSlice[:, self.in_column]].values elif self.mode == TransformMode.macro: - x = self._reshape(df) + x = self._preprocess_macro(df) else: raise ValueError(f"'{self.mode}' is not a valid TransformMode.") @@ -127,23 +131,26 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- : transformed DataFrame. + + Raises + ------ + ValueError: + If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. 
""" - df = df.sort_index(axis=1) - segments = sorted(set(df.columns.get_level_values("segment"))) + if self._fit_segments is None: + raise ValueError("The transform isn't fitted!") + else: + self.in_column = cast(List[str], self.in_column) - if self.mode == TransformMode.per_segment: - x = df.loc[:, pd.IndexSlice[:, self.in_column]].values - transformed = self.transformer.transform(X=x) + df = df.sort_index(axis=1) + transformed = self._make_transform(df) - elif self.mode == TransformMode.macro: - x = self._reshape(df) - transformed = self.transformer.transform(X=x) - transformed = self._inverse_reshape(df, transformed) - else: - raise ValueError(f"'{self.mode}' is not a valid TransformMode.") if self.inplace: df.loc[:, pd.IndexSlice[:, self.in_column]] = transformed else: + segments = sorted(set(df.columns.get_level_values("segment"))) transformed_features = pd.DataFrame( transformed, columns=df.loc[:, pd.IndexSlice[:, self.in_column]].columns, index=df.index ).sort_index(axis=1) @@ -166,10 +173,20 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- : transformed DataFrame. + + Raises + ------ + ValueError: + If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. 
""" + if self._fit_segments is None: + raise ValueError("The transform isn't fitted!") + else: + self.in_column = cast(List[str], self.in_column) + df = df.sort_index(axis=1) - if self.in_column is None: - raise ValueError("Transform is not fitted yet.") if "target" in self.in_column: quantiles = match_target_quantiles(set(df.columns.get_level_values("feature"))) @@ -178,60 +195,85 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: if self.inplace: quantiles_arrays: Dict[str, pd.DataFrame] = dict() + transformed = self._make_inverse_transform(df) - if self.mode == TransformMode.per_segment: - x = df.loc[:, pd.IndexSlice[:, self.in_column]].values - transformed = self.transformer.inverse_transform(X=x) - - # quantiles inverse transformation - for quantile_column_nm in quantiles: - df_slice_copy = df.loc[:, pd.IndexSlice[:, self.in_column]].copy() - df_slice_copy = set_columns_wide( - df_slice_copy, df, features_left=["target"], features_right=[quantile_column_nm] - ) - transformed_quantile = self.transformer.inverse_transform(X=df_slice_copy) - df_slice_copy.loc[:, pd.IndexSlice[:, self.in_column]] = transformed_quantile - quantiles_arrays[quantile_column_nm] = df_slice_copy.loc[:, pd.IndexSlice[:, "target"]].rename( - columns={"target": quantile_column_nm} - ) - - elif self.mode == TransformMode.macro: - x = self._reshape(df) - transformed = self.transformer.inverse_transform(X=x) - transformed = self._inverse_reshape(df, transformed) - - # quantiles inverse transformation - for quantile_column_nm in quantiles: - df_slice_copy = df.loc[:, pd.IndexSlice[:, self.in_column]].copy() - df_slice_copy = set_columns_wide( - df_slice_copy, df, features_left=["target"], features_right=[quantile_column_nm] - ) - df_slice_copy_reshaped_array = self._reshape(df_slice_copy) - transformed_quantile = self.transformer.inverse_transform(X=df_slice_copy_reshaped_array) - inverse_reshaped_quantile = self._inverse_reshape(df_slice_copy, transformed_quantile) - 
df_slice_copy.loc[:, pd.IndexSlice[:, self.in_column]] = inverse_reshaped_quantile - quantiles_arrays[quantile_column_nm] = df_slice_copy.loc[:, pd.IndexSlice[:, "target"]].rename( - columns={"target": quantile_column_nm} - ) - - else: - raise ValueError(f"'{self.mode}' is not a valid TransformMode.") - df.loc[:, pd.IndexSlice[:, self.in_column]] = transformed + # quantiles inverse transformation + for quantile_column_nm in quantiles: + df_slice_copy = df.loc[:, pd.IndexSlice[:, self.in_column]].copy() + df_slice_copy = set_columns_wide( + df_slice_copy, df, features_left=["target"], features_right=[quantile_column_nm] + ) + transformed_quantile = self._make_inverse_transform(df_slice_copy) + df_slice_copy.loc[:, pd.IndexSlice[:, self.in_column]] = transformed_quantile + quantiles_arrays[quantile_column_nm] = df_slice_copy.loc[:, pd.IndexSlice[:, "target"]].rename( + columns={"target": quantile_column_nm} + ) + df.loc[:, pd.IndexSlice[:, self.in_column]] = transformed for quantile_column_nm in quantiles: df.loc[:, pd.IndexSlice[:, quantile_column_nm]] = quantiles_arrays[quantile_column_nm].values + return df - def _reshape(self, df: pd.DataFrame) -> np.ndarray: + def _preprocess_macro(self, df: pd.DataFrame) -> np.ndarray: segments = sorted(set(df.columns.get_level_values("segment"))) x = df.loc[:, pd.IndexSlice[:, self.in_column]] x = pd.concat([x[segment] for segment in segments]).values return x - def _inverse_reshape(self, df: pd.DataFrame, transformed: np.ndarray) -> np.ndarray: + def _postprocess_macro(self, df: pd.DataFrame, transformed: np.ndarray) -> np.ndarray: time_period_len = len(df) n_segments = len(set(df.columns.get_level_values("segment"))) transformed = np.concatenate( [transformed[i * time_period_len : (i + 1) * time_period_len, :] for i in range(n_segments)], axis=1 ) return transformed + + def _preprocess_per_segment(self, df: pd.DataFrame) -> np.ndarray: + self._fit_segments = cast(List[str], self._fit_segments) + transform_segments = 
df.columns.get_level_values("segment").unique().tolist() + check_new_segments(transform_segments=transform_segments, fit_segments=self._fit_segments) + + df = df.loc[:, pd.IndexSlice[:, self.in_column]] + to_add_segments = set(self._fit_segments) - set(transform_segments) + df_to_add = pd.DataFrame(index=df.index, columns=pd.MultiIndex.from_product([to_add_segments, self.in_column])) + df = pd.concat([df, df_to_add], axis=1) + df = df.sort_index(axis=1) + return df.values + + def _postprocess_per_segment(self, df: pd.DataFrame, transformed: np.ndarray) -> np.ndarray: + self._fit_segments = cast(List[str], self._fit_segments) + self.in_column = cast(List[str], self.in_column) + num_features = len(self.in_column) + transform_segments = set(df.columns.get_level_values("segment")) + select_segments = [segment in transform_segments for segment in self._fit_segments] + # make a mask for columns to select + select_columns = np.repeat(select_segments, num_features) + result = transformed[:, select_columns] + return result + + def _make_transform(self, df: pd.DataFrame) -> np.ndarray: + if self.mode == TransformMode.per_segment: + x = self._preprocess_per_segment(df) + transformed = self.transformer.transform(X=x) + transformed = self._postprocess_per_segment(df, transformed) + elif self.mode == TransformMode.macro: + x = self._preprocess_macro(df) + transformed = self.transformer.transform(X=x) + transformed = self._postprocess_macro(df, transformed) + else: + raise ValueError(f"'{self.mode}' is not a valid TransformMode.") + return transformed + + def _make_inverse_transform(self, df: pd.DataFrame) -> np.ndarray: + if self.mode == TransformMode.per_segment: + x = self._preprocess_per_segment(df) + transformed = self.transformer.inverse_transform(X=x) + transformed = self._postprocess_per_segment(df, transformed) + elif self.mode == TransformMode.macro: + x = self._preprocess_macro(df) + transformed = self.transformer.inverse_transform(X=x) + transformed = 
self._postprocess_macro(df, transformed) + else: + raise ValueError(f"'{self.mode}' is not a valid TransformMode.") + return transformed diff --git a/etna/transforms/missing_values/resample.py b/etna/transforms/missing_values/resample.py index 46827a31a..3322be686 100644 --- a/etna/transforms/missing_values/resample.py +++ b/etna/transforms/missing_values/resample.py @@ -135,13 +135,13 @@ def __init__( self.in_column = in_column self.distribution_column = distribution_column self.inplace = inplace - self.out_column = self._get_out_column(out_column) + self.out_column = out_column super().__init__( transform=_OneSegmentResampleWithDistributionTransform( in_column=in_column, distribution_column=distribution_column, inplace=inplace, - out_column=self.out_column, + out_column=self._get_out_column(self.out_column), ) ) diff --git a/etna/transforms/outliers/base.py b/etna/transforms/outliers/base.py index d68075f33..51d928cc0 100644 --- a/etna/transforms/outliers/base.py +++ b/etna/transforms/outliers/base.py @@ -9,6 +9,7 @@ from etna.datasets import TSDataset from etna.transforms.base import Transform +from etna.transforms.utils import check_new_segments class OutliersTransform(Transform, ABC): @@ -26,6 +27,7 @@ def __init__(self, in_column: str): self.in_column = in_column self.outliers_timestamps: Optional[Dict[str, List[pd.Timestamp]]] = None self.original_values: Optional[Dict[str, List[pd.Timestamp]]] = None + self._fit_segments: Optional[List[str]] = None def _save_original_values(self, ts: TSDataset): """ @@ -61,6 +63,7 @@ def fit(self, df: pd.DataFrame) -> "OutliersTransform": ts = TSDataset(df, freq=pd.infer_freq(df.index)) self.outliers_timestamps = self.detect_outliers(ts) self._save_original_values(ts) + self._fit_segments = ts.segments return self @@ -75,14 +78,25 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: pd.DataFrame + result: dataframe with in_column series with filled with NaNs + + Raises + ------ + ValueError: + 
If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. """ if self.outliers_timestamps is None: raise ValueError("Transform is not fitted! Fit the Transform before calling transform method.") result_df = df.copy() - for segment in df.columns.get_level_values("segment").unique(): - result_df.loc[self.outliers_timestamps[segment], pd.IndexSlice[segment, self.in_column]] = np.NaN + segments = df.columns.get_level_values("segment").unique().tolist() + check_new_segments(transform_segments=segments, fit_segments=self._fit_segments) + for segment in segments: + # to locate only present indices + segment_outliers_timestamps = result_df.index.intersection(self.outliers_timestamps[segment]) + result_df.loc[segment_outliers_timestamps, pd.IndexSlice[segment, self.in_column]] = np.NaN return result_df def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: @@ -96,13 +110,22 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: Returns ------- - result: pd.DataFrame + result: data with reconstructed values + + Raises + ------ + ValueError: + If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. """ if self.original_values is None or self.outliers_timestamps is None: raise ValueError("Transform is not fitted! 
Fit the Transform before calling inverse_transform method.") result = df.copy() - for segment in self.original_values.keys(): + segments = df.columns.get_level_values("segment").unique().tolist() + check_new_segments(transform_segments=segments, fit_segments=self._fit_segments) + for segment in segments: segment_ts = result[segment, self.in_column] segment_ts[segment_ts.index.isin(self.outliers_timestamps[segment])] = self.original_values[segment] return result diff --git a/etna/transforms/timestamp/special_days.py b/etna/transforms/timestamp/special_days.py index 265257b96..8db6c8469 100644 --- a/etna/transforms/timestamp/special_days.py +++ b/etna/transforms/timestamp/special_days.py @@ -1,4 +1,6 @@ import datetime +from typing import Any +from typing import Dict from typing import Optional from typing import Tuple @@ -55,6 +57,7 @@ def __init__(self, find_special_weekday: bool = True, find_special_month_day: bo self.anomaly_week_days: Optional[Tuple[int]] = None self.anomaly_month_days: Optional[Tuple[int]] = None + self.res_type: Dict[str, Any] if self.find_special_weekday and find_special_month_day: self.res_type = {"df_sample": (0, 0), "columns": ["anomaly_weekdays", "anomaly_monthdays"]} elif self.find_special_weekday: diff --git a/etna/transforms/utils.py b/etna/transforms/utils.py index 3e6615257..8b491c752 100644 --- a/etna/transforms/utils.py +++ b/etna/transforms/utils.py @@ -1 +1,17 @@ +import reprlib +from typing import List +from typing import Optional + from etna.datasets.utils import match_target_quantiles # noqa: F401 + + +def check_new_segments(transform_segments: List[str], fit_segments: Optional[List[str]]): + """Check if there are any new segments that weren't present during training.""" + if fit_segments is None: + raise ValueError("Transform is not fitted!") + + new_segments = set(transform_segments) - set(fit_segments) + if len(new_segments) > 0: + raise NotImplementedError( + f"This transform can't process segments that weren't present on 
train data: {reprlib.repr(new_segments)}" + ) diff --git a/examples/inference.ipynb b/examples/inference.ipynb index 9bd3934bf..9d6777deb 100644 --- a/examples/inference.ipynb +++ b/examples/inference.ipynb @@ -230,7 +230,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -644,7 +644,7 @@ "id": "59012669", "metadata": {}, "source": [ - "Here we explicitly set `ts=new_ts` in `load` function in order to pass it inside our `pipeline_loaded`. Otherwise, `pipeline_loaded` doesn't have any `ts` to work with.\n", + "Here we explicitly set `ts=new_ts` in `load` function in order to pass it inside our `pipeline_loaded`. Otherwise, `pipeline_loaded` doesn't have `ts` to forecast and we should explicitly call `forecast(ts=new_ts)` for making a forecast.\n", "\n", "We can also load saved model and transoform using `load`, but we shouldn't set `ts` parameter, because models and transforms don't need it." ] @@ -722,7 +722,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -744,9 +744,9 @@ { "data": { "text/plain": [ - "{'segment_d': 18.20146757117957,\n", + "{'segment_c': 25.23759225436336,\n", " 'segment_b': 4.828671629496564,\n", - " 'segment_c': 25.23759225436336,\n", + " 'segment_d': 18.20146757117957,\n", " 'segment_a': 8.73107925541017}" ] }, @@ -778,7 +778,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -805,9 +805,9 @@ { "data": { "text/plain": [ - "{'segment_d': 11.162075802124274,\n", + "{'segment_c': 18.357231604941372,\n", " 'segment_b': 4.703408652853966,\n", - " 'segment_c': 18.357231604941372,\n", + " 'segment_d': 11.162075802124274,\n", " 'segment_a': 5.587809488492237}" ] }, diff --git a/poetry.lock b/poetry.lock index 5d48c599a..736ea5126 100644 --- a/poetry.lock +++ b/poetry.lock @@ -718,7 +718,7 @@ python-versions = ">=3.7" [[package]] name = "fsspec" -version = "2022.11.0" +version = "2023.1.0" description = "File-system specification" category = "main" optional = true @@ -1409,29 +1409,30 @@ python-versions = ">=3.7" [[package]] name = "mypy" -version = "0.910" +version = "0.950" description = "Optional static typing for Python" category = "main" optional = true -python-versions = ">=3.5" +python-versions = ">=3.6" [package.dependencies] -mypy-extensions = ">=0.4.3,<0.5.0" -toml = "*" -typed-ast = {version = ">=1.4.0,<1.5.0", markers = "python_version < \"3.8\""} -typing-extensions = ">=3.7.4" +mypy-extensions = ">=0.4.3" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} +typing-extensions = ">=3.10" [package.extras] dmypy = ["psutil (>=4.0)"] -python2 = ["typed-ast (>=1.4.0,<1.5.0)"] +python2 = ["typed-ast (>=1.4.0,<2)"] +reports = ["lxml"] [[package]] name = "mypy-extensions" -version = "0.4.3" -description = "Experimental type system extensions for programs checked with the mypy typechecker." +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." 
category = "main" optional = true -python-versions = "*" +python-versions = ">=3.5" [[package]] name = "myst-parser" @@ -3303,7 +3304,7 @@ wandb = ["wandb"] [metadata] lock-version = "1.1" python-versions = ">=3.7.1, <3.11.0" -content-hash = "78cfffbb71287b0db8af81005d304cbb6e63ed0b725970863bc5800410b70829" +content-hash = "463b30bacf7ec13ce5181b21a964f0abac9112aed2f5769192c78bcd90c9eec3" [metadata.files] absl-py = [ @@ -3838,8 +3839,8 @@ frozenlist = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] fsspec = [ - {file = "fsspec-2022.11.0-py3-none-any.whl", hash = "sha256:d6e462003e3dcdcb8c7aa84c73a228f8227e72453cd22570e2363e8844edfe7b"}, - {file = "fsspec-2022.11.0.tar.gz", hash = "sha256:259d5fd5c8e756ff2ea72f42e7613c32667dc2049a4ac3d84364a7ca034acb8b"}, + {file = "fsspec-2023.1.0-py3-none-any.whl", hash = "sha256:b833e2e541e9e8cde0ab549414187871243177feb3d344f9d27b25a93f5d8139"}, + {file = "fsspec-2023.1.0.tar.gz", hash = "sha256:fbae7f20ff801eb5f7d0bedf81f25c787c0dfac5e982d98fa3884a9cde2b5411"}, ] gitdb = [ {file = "gitdb-4.0.9-py3-none-any.whl", hash = "sha256:8033ad4e853066ba6ca92050b9df2f89301b8fc8bf7e9324d412a63f8bf1a8fd"}, @@ -4199,33 +4200,33 @@ mistune = [ ] multidict = [] mypy = [ - {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, - {file = "mypy-0.910-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb"}, - {file = "mypy-0.910-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9"}, - {file = "mypy-0.910-cp35-cp35m-win_amd64.whl", hash = "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e"}, - {file = "mypy-0.910-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921"}, - {file = 
"mypy-0.910-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6"}, - {file = "mypy-0.910-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212"}, - {file = "mypy-0.910-cp36-cp36m-win_amd64.whl", hash = "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885"}, - {file = "mypy-0.910-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0"}, - {file = "mypy-0.910-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de"}, - {file = "mypy-0.910-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703"}, - {file = "mypy-0.910-cp37-cp37m-win_amd64.whl", hash = "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a"}, - {file = "mypy-0.910-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504"}, - {file = "mypy-0.910-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9"}, - {file = "mypy-0.910-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072"}, - {file = "mypy-0.910-cp38-cp38-win_amd64.whl", hash = "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811"}, - {file = "mypy-0.910-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e"}, - {file = "mypy-0.910-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b"}, - {file = "mypy-0.910-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2"}, - {file = "mypy-0.910-cp39-cp39-manylinux2010_x86_64.whl", hash = 
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97"}, - {file = "mypy-0.910-cp39-cp39-win_amd64.whl", hash = "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8"}, - {file = "mypy-0.910-py3-none-any.whl", hash = "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"}, - {file = "mypy-0.910.tar.gz", hash = "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150"}, + {file = "mypy-0.950-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cf9c261958a769a3bd38c3e133801ebcd284ffb734ea12d01457cb09eacf7d7b"}, + {file = "mypy-0.950-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5b5bd0ffb11b4aba2bb6d31b8643902c48f990cc92fda4e21afac658044f0c0"}, + {file = "mypy-0.950-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e7647df0f8fc947388e6251d728189cfadb3b1e558407f93254e35abc026e22"}, + {file = "mypy-0.950-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eaff8156016487c1af5ffa5304c3e3fd183edcb412f3e9c72db349faf3f6e0eb"}, + {file = "mypy-0.950-cp310-cp310-win_amd64.whl", hash = "sha256:563514c7dc504698fb66bb1cf897657a173a496406f1866afae73ab5b3cdb334"}, + {file = "mypy-0.950-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:dd4d670eee9610bf61c25c940e9ade2d0ed05eb44227275cce88701fee014b1f"}, + {file = "mypy-0.950-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ca75ecf2783395ca3016a5e455cb322ba26b6d33b4b413fcdedfc632e67941dc"}, + {file = "mypy-0.950-cp36-cp36m-win_amd64.whl", hash = "sha256:6003de687c13196e8a1243a5e4bcce617d79b88f83ee6625437e335d89dfebe2"}, + {file = "mypy-0.950-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4c653e4846f287051599ed8f4b3c044b80e540e88feec76b11044ddc5612ffed"}, + {file = "mypy-0.950-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:e19736af56947addedce4674c0971e5dceef1b5ec7d667fe86bcd2b07f8f9075"}, + {file = "mypy-0.950-cp37-cp37m-win_amd64.whl", hash = "sha256:ef7beb2a3582eb7a9f37beaf38a28acfd801988cde688760aea9e6cc4832b10b"}, + {file = "mypy-0.950-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0112752a6ff07230f9ec2f71b0d3d4e088a910fdce454fdb6553e83ed0eced7d"}, + {file = "mypy-0.950-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ee0a36edd332ed2c5208565ae6e3a7afc0eabb53f5327e281f2ef03a6bc7687a"}, + {file = "mypy-0.950-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77423570c04aca807508a492037abbd72b12a1fb25a385847d191cd50b2c9605"}, + {file = "mypy-0.950-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5ce6a09042b6da16d773d2110e44f169683d8cc8687e79ec6d1181a72cb028d2"}, + {file = "mypy-0.950-cp38-cp38-win_amd64.whl", hash = "sha256:5b231afd6a6e951381b9ef09a1223b1feabe13625388db48a8690f8daa9b71ff"}, + {file = "mypy-0.950-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0384d9f3af49837baa92f559d3fa673e6d2652a16550a9ee07fc08c736f5e6f8"}, + {file = "mypy-0.950-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1fdeb0a0f64f2a874a4c1f5271f06e40e1e9779bf55f9567f149466fc7a55038"}, + {file = "mypy-0.950-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:61504b9a5ae166ba5ecfed9e93357fd51aa693d3d434b582a925338a2ff57fd2"}, + {file = "mypy-0.950-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a952b8bc0ae278fc6316e6384f67bb9a396eb30aced6ad034d3a76120ebcc519"}, + {file = "mypy-0.950-cp39-cp39-win_amd64.whl", hash = "sha256:eaea21d150fb26d7b4856766e7addcf929119dd19fc832b22e71d942835201ef"}, + {file = "mypy-0.950-py3-none-any.whl", hash = "sha256:a4d9898f46446bfb6405383b57b96737dcfd0a7f25b748e78ef3e8c576bba3cb"}, + {file = "mypy-0.950.tar.gz", hash = "sha256:1b333cfbca1762ff15808a0ef4f71b5d3eed8528b23ea1c3fb50543c867d68de"}, ] mypy-extensions = [ - {file = 
"mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, - {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] myst-parser = [ {file = "myst-parser-0.15.2.tar.gz", hash = "sha256:f7f3b2d62db7655cde658eb5d62b2ec2a4631308137bd8d10f296a40d57bbbeb"}, diff --git a/pyproject.toml b/pyproject.toml index dcab867e4..008fd47c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,7 @@ pep8-naming = {version = "^0.12.1", optional = true} flake8-bugbear = {version = "^22.4.25", optional = true} flake8-comprehensions = {version = "^3.9.0", optional = true} flake8-docstrings = {version = "^1.6.0", optional = true} -mypy = {version = "^0.910", optional = true} +mypy = {version = "^0.950", optional = true} types-PyYAML = {version = "^6.0.0", optional = true} codespell = {version = "^2.0.0", optional = true} @@ -269,6 +269,11 @@ markers = [ "long_2" ] +[tool.coverage.report] +exclude_lines = [ + '^ +assert_never\(.*?\)$', +] + [tool.mypy] ignore_missing_imports = true strict_optional = true diff --git a/tests/test_ensembles/test_direct_ensemble.py b/tests/test_ensembles/test_direct_ensemble.py index 031b8efdd..7edd6e320 100644 --- a/tests/test_ensembles/test_direct_ensemble.py +++ b/tests/test_ensembles/test_direct_ensemble.py @@ -9,6 +9,8 @@ from etna.models import NaiveModel from etna.pipeline import Pipeline from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts_with_prediction_intervals @pytest.fixture 
@@ -67,5 +69,18 @@ def test_predict(direct_ensemble_pipeline, simple_ts_train): pd.testing.assert_frame_equal(prediction.to_pandas(), expected_prediction.to_pandas()) -def test_save_load(direct_ensemble_pipeline, example_tsds): - assert_pipeline_equals_loaded_original(pipeline=direct_ensemble_pipeline, ts=example_tsds) +@pytest.mark.parametrize("load_ts", [True, False]) +def test_save_load(load_ts, direct_ensemble_pipeline, example_tsds): + assert_pipeline_equals_loaded_original(pipeline=direct_ensemble_pipeline, ts=example_tsds, load_ts=load_ts) + + +def test_forecast_given_ts(direct_ensemble_pipeline, example_tsds): + assert_pipeline_forecasts_given_ts( + pipeline=direct_ensemble_pipeline, ts=example_tsds, horizon=direct_ensemble_pipeline.horizon + ) + + +def test_forecast_given_ts_with_prediction_interval(direct_ensemble_pipeline, example_tsds): + assert_pipeline_forecasts_given_ts_with_prediction_intervals( + pipeline=direct_ensemble_pipeline, ts=example_tsds, horizon=direct_ensemble_pipeline.horizon + ) diff --git a/tests/test_ensembles/test_stacking_ensemble.py b/tests/test_ensembles/test_stacking_ensemble.py index c10ae042c..e706ee6ee 100644 --- a/tests/test_ensembles/test_stacking_ensemble.py +++ b/tests/test_ensembles/test_stacking_ensemble.py @@ -15,6 +15,8 @@ from etna.metrics import MAE from etna.pipeline import Pipeline from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts_with_prediction_intervals HORIZON = 7 @@ -138,7 +140,7 @@ def test_make_features( ensemble = StackingEnsemble( pipelines=[naive_featured_pipeline_1, naive_featured_pipeline_2], features_to_use=features_to_use ).fit(example_tsds) - x, y = ensemble._make_features(forecasts_ts, train=True) + x, y = ensemble._make_features(ts=example_tsds, forecasts=forecasts_ts, train=True) features = 
set(x.columns.get_level_values("feature")) assert isinstance(x, pd.DataFrame) assert isinstance(y, pd.Series) @@ -252,7 +254,7 @@ def test_forecast_calls_process_forecasts(example_tsds: TSDataset, naive_ensembl naive_ensemble.fit(ts=example_tsds) naive_ensemble._process_forecasts = MagicMock() - result = naive_ensemble._forecast() + result = naive_ensemble._forecast(ts=example_tsds) naive_ensemble._process_forecasts.assert_called_once() assert result == naive_ensemble._process_forecasts.return_value @@ -314,11 +316,24 @@ def test_backtest(stacking_ensemble_pipeline: StackingEnsemble, example_tsds: TS assert isinstance(df, pd.DataFrame) -def test_forecast_raise_error_if_not_fitted(naive_ensemble: StackingEnsemble): - """Test that StackingEnsemble raise error when calling forecast without being fit.""" - with pytest.raises(ValueError, match="StackingEnsemble is not fitted!"): +def test_forecast_raise_error_if_no_ts(naive_ensemble: StackingEnsemble): + """Test that StackingEnsemble raises error when calling forecast without ts.""" + with pytest.raises(ValueError, match="There is no ts to forecast!"): _ = naive_ensemble.forecast() -def test_save_load(stacking_ensemble_pipeline, example_tsds): - assert_pipeline_equals_loaded_original(pipeline=stacking_ensemble_pipeline, ts=example_tsds) +@pytest.mark.parametrize("load_ts", [True, False]) +def test_save_load(stacking_ensemble_pipeline, example_tsds, load_ts): + assert_pipeline_equals_loaded_original(pipeline=stacking_ensemble_pipeline, ts=example_tsds, load_ts=load_ts) + + +def test_forecast_given_ts(stacking_ensemble_pipeline, example_tsds): + assert_pipeline_forecasts_given_ts( + pipeline=stacking_ensemble_pipeline, ts=example_tsds, horizon=stacking_ensemble_pipeline.horizon + ) + + +def test_forecast_given_ts_with_prediction_interval(stacking_ensemble_pipeline, example_tsds): + assert_pipeline_forecasts_given_ts_with_prediction_intervals( + pipeline=stacking_ensemble_pipeline, ts=example_tsds, 
horizon=stacking_ensemble_pipeline.horizon + ) diff --git a/tests/test_ensembles/test_voting_ensemble.py b/tests/test_ensembles/test_voting_ensemble.py index 72d8d3d8a..d72f1dcc2 100644 --- a/tests/test_ensembles/test_voting_ensemble.py +++ b/tests/test_ensembles/test_voting_ensemble.py @@ -16,6 +16,8 @@ from etna.metrics import MAE from etna.pipeline import Pipeline from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts_with_prediction_intervals HORIZON = 7 @@ -119,7 +121,7 @@ def test_vote_default_weights(simple_df: TSDataset, naive_pipeline_1: Pipeline, ensemble = VotingEnsemble(pipelines=[naive_pipeline_1, naive_pipeline_2]) ensemble.fit(ts=simple_df) forecasts = Parallel(n_jobs=ensemble.n_jobs, backend="multiprocessing", verbose=11)( - delayed(ensemble._forecast_pipeline)(pipeline=pipeline) for pipeline in ensemble.pipelines + delayed(ensemble._forecast_pipeline)(pipeline=pipeline, ts=simple_df) for pipeline in ensemble.pipelines ) forecast = ensemble._vote(forecasts=forecasts) np.testing.assert_array_equal(forecast[:, "A", "target"].values, [47.5, 48, 47.5, 48, 47.5, 48, 47.5]) @@ -131,7 +133,7 @@ def test_vote_custom_weights(simple_df: TSDataset, naive_pipeline_1: Pipeline, n ensemble = VotingEnsemble(pipelines=[naive_pipeline_1, naive_pipeline_2], weights=[1, 3]) ensemble.fit(ts=simple_df) forecasts = Parallel(n_jobs=ensemble.n_jobs, backend="multiprocessing", verbose=11)( - delayed(ensemble._forecast_pipeline)(pipeline=pipeline) for pipeline in ensemble.pipelines + delayed(ensemble._forecast_pipeline)(pipeline=pipeline, ts=simple_df) for pipeline in ensemble.pipelines ) forecast = ensemble._vote(forecasts=forecasts) np.testing.assert_array_equal(forecast[:, "A", "target"].values, [47.25, 48, 47.25, 48, 47.25, 48, 47.25]) @@ -143,7 +145,7 @@ def test_forecast_calls_vote(example_tsds: 
TSDataset, naive_pipeline_1: Pipeline ensemble.fit(ts=example_tsds) ensemble._vote = MagicMock() - result = ensemble._forecast() + result = ensemble._forecast(ts=example_tsds) ensemble._vote.assert_called_once() assert result == ensemble._vote.return_value @@ -197,5 +199,18 @@ def test_backtest(voting_ensemble_pipeline: VotingEnsemble, example_tsds: TSData assert isinstance(df, pd.DataFrame) -def test_save_load(voting_ensemble_pipeline, example_tsds): - assert_pipeline_equals_loaded_original(pipeline=voting_ensemble_pipeline, ts=example_tsds) +@pytest.mark.parametrize("load_ts", [True, False]) +def test_save_load(load_ts, voting_ensemble_pipeline, example_tsds): + assert_pipeline_equals_loaded_original(pipeline=voting_ensemble_pipeline, ts=example_tsds, load_ts=load_ts) + + +def test_forecast_given_ts(voting_ensemble_pipeline, example_tsds): + assert_pipeline_forecasts_given_ts( + pipeline=voting_ensemble_pipeline, ts=example_tsds, horizon=voting_ensemble_pipeline.horizon + ) + + +def test_forecast_given_ts_with_prediction_interval(voting_ensemble_pipeline, example_tsds): + assert_pipeline_forecasts_given_ts_with_prediction_intervals( + pipeline=voting_ensemble_pipeline, ts=example_tsds, horizon=voting_ensemble_pipeline.horizon + ) diff --git a/tests/test_loggers/test_file_logger.py b/tests/test_loggers/test_file_logger.py index b1603611e..ac4d7ea83 100644 --- a/tests/test_loggers/test_file_logger.py +++ b/tests/test_loggers/test_file_logger.py @@ -249,11 +249,12 @@ def test_local_file_logger_with_stacking_ensemble(example_df): assert len(list(cur_dir.iterdir())) == 1, "we've run one experiment" current_experiment_dir = list(cur_dir.iterdir())[0] - assert len(list(current_experiment_dir.iterdir())) == 2, "crossval and crossval_results folders" + assert len(list(current_experiment_dir.iterdir())) == 4, "training, forecasting, crossval, crossval_results" - assert ( - len(list((current_experiment_dir / "crossval").iterdir())) == n_folds - ), "crossval should have 
`n_folds` runs" + for folder in ["training", "forecasting", "crossval"]: + assert ( + len(list((current_experiment_dir / folder).iterdir())) == n_folds + ), f"{folder} should have `n_folds` runs" tslogger.remove(idx) @@ -281,11 +282,14 @@ def test_local_file_logger_with_empirical_prediction_interval(example_df): assert len(list(cur_dir.iterdir())) == 1, "we've run one experiment" current_experiment_dir = list(cur_dir.iterdir())[0] - assert len(list(current_experiment_dir.iterdir())) == 2, "crossval and crossval_results folders" - assert ( - len(list((current_experiment_dir / "crossval").iterdir())) == n_folds - ), "crossval should have `n_folds` runs" + len(list(current_experiment_dir.iterdir())) == 4 + ), "training, forecasting, crossval, crossval_results folders" + + for folder in ["training", "forecasting", "crossval"]: + assert ( + len(list((current_experiment_dir / folder).iterdir())) == n_folds + ), f"{folder} should have `n_folds` runs" tslogger.remove(idx) diff --git a/tests/test_models/nn/test_mlp.py b/tests/test_models/nn/test_mlp.py index 639305244..aed7f9448 100644 --- a/tests/test_models/nn/test_mlp.py +++ b/tests/test_models/nn/test_mlp.py @@ -24,7 +24,6 @@ ], ) def test_mlp_model_run_weekly_overfit_with_scaler(ts_dataset_weekly_function_with_horizon, horizon): - ts_train, ts_test = ts_dataset_weekly_function_with_horizon(horizon) lag = LagTransform(in_column="target", lags=list(range(horizon, horizon + 4))) fourier = FourierTransform(period=7, order=3) @@ -80,6 +79,17 @@ def test_mlp_make_samples(simple_df_relevance): np.testing.assert_equal(df[["target"]].iloc[decoder_length : 2 * decoder_length], second_sample["decoder_target"]) +def test_mlp_forward_fail_nans(): + batch = { + "decoder_real": torch.Tensor([[torch.nan, 2, 3], [1, 2, 3], [1, 2, 3]]), + "decoder_target": torch.Tensor([[1], [2], [3]]), + "segment": "A", + } + model = MLPNet(input_size=3, hidden_size=[1], lr=1e-2, loss=nn.MSELoss(), optimizer_params=None) + with 
pytest.raises(ValueError, match="There are NaNs in features"): + _ = model.forward(batch) + + def test_mlp_step(): batch = { @@ -96,6 +106,17 @@ def test_mlp_step(): assert output.shape == torch.Size([3, 1]) +def test_mlp_step_fail_nans(): + batch = { + "decoder_real": torch.Tensor([[torch.nan, 2, 3], [1, 2, 3], [1, 2, 3]]), + "decoder_target": torch.Tensor([[1], [2], [3]]), + "segment": "A", + } + model = MLPNet(input_size=3, hidden_size=[1], lr=1e-2, loss=nn.MSELoss(), optimizer_params=None) + with pytest.raises(ValueError, match="There are NaNs in features"): + _ = model.step(batch) + + def test_mlp_layers(): model = MLPNet(input_size=3, hidden_size=[10], lr=1e-2, loss=None, optimizer_params=None) model_ = nn.Sequential( diff --git a/tests/test_models/test_base.py b/tests/test_models/test_base.py index 9ae93dffc..824285962 100644 --- a/tests/test_models/test_base.py +++ b/tests/test_models/test_base.py @@ -14,6 +14,7 @@ @pytest.fixture() def deep_base_model_mock(): model = MagicMock() + model.encoder_length = 10 model.train_batch_size = 32 model.train_dataloader_params = {} model.val_dataloader_params = {} @@ -23,6 +24,13 @@ def deep_base_model_mock(): return model +@pytest.fixture() +def ts_mock(): + torch_dataset = MagicMock() + torch_dataset.index.__len__.return_value = 100 + return torch_dataset + + @pytest.fixture() def sized_torch_dataset_mock(): torch_dataset = MagicMock() @@ -138,15 +146,19 @@ def test_deep_base_model_raw_predict_call(dataloader, deep_base_model_mock): np.testing.assert_allclose(predictions_dict[("segment2", "target")], batch["target"][1].numpy()) -def test_deep_base_model_forecast_inverse_transform_call_check(deep_base_model_mock): - ts = MagicMock() +def test_deep_base_model_forecast_inverse_transform_call_check(deep_base_model_mock, ts_mock): horizon = 7 - DeepBaseModel.forecast(self=deep_base_model_mock, ts=ts, prediction_size=horizon) - ts.tsdataset_idx_slice.return_value.inverse_transform.assert_called_once() + 
DeepBaseModel.forecast(self=deep_base_model_mock, ts=ts_mock, prediction_size=horizon) + ts_mock.tsdataset_idx_slice.return_value.inverse_transform.assert_called_once() + + +def test_deep_base_model_forecast_fail_not_enough_context(deep_base_model_mock, ts_mock): + horizon = len(ts_mock.index) + with pytest.raises(ValueError, match="Given context isn't big enough"): + _ = DeepBaseModel.forecast(self=deep_base_model_mock, ts=ts_mock, prediction_size=horizon) -def test_deep_base_model_forecast_loop(simple_df, deep_base_model_mock): - ts = MagicMock() +def test_deep_base_model_forecast_loop(simple_df, deep_base_model_mock, ts_mock): ts_after_tsdataset_idx_slice = MagicMock() horizon = 7 @@ -154,13 +166,13 @@ def test_deep_base_model_forecast_loop(simple_df, deep_base_model_mock): deep_base_model_mock.raw_predict.return_value = raw_predict ts_after_tsdataset_idx_slice.df = simple_df.df.iloc[-horizon:] - ts.tsdataset_idx_slice.return_value = ts_after_tsdataset_idx_slice + ts_mock.tsdataset_idx_slice.return_value = ts_after_tsdataset_idx_slice - future = DeepBaseModel.forecast(self=deep_base_model_mock, ts=ts, prediction_size=horizon) + future = DeepBaseModel.forecast(self=deep_base_model_mock, ts=ts_mock, prediction_size=horizon) np.testing.assert_allclose( future.df.loc[:, pd.IndexSlice["A", "target"]], raw_predict[("A", "target")][:horizon, 0] ) np.testing.assert_allclose( future.df.loc[:, pd.IndexSlice["B", "target"]], raw_predict[("B", "target")][:horizon, 0] ) - ts.tsdataset_idx_slice.return_value.inverse_transform.assert_called_once() + ts_mock.tsdataset_idx_slice.return_value.inverse_transform.assert_called_once() diff --git a/tests/test_models/test_inference/common.py b/tests/test_models/test_inference/common.py index 3314ea9d2..e70775cc7 100644 --- a/tests/test_models/test_inference/common.py +++ b/tests/test_models/test_inference/common.py @@ -1,25 +1,10 @@ -import functools - import numpy as np -import pytest from typing_extensions import get_args from 
etna.datasets import TSDataset from etna.models import ContextRequiredModelType -def to_be_fixed(raises, match=None): - def to_be_fixed_concrete(func): - @functools.wraps(func) - def wrapped_test(*args, **kwargs): - with pytest.raises(raises, match=match): - return func(*args, **kwargs) - - return wrapped_test - - return to_be_fixed_concrete - - def make_prediction(model, ts, prediction_size, method_name) -> TSDataset: method = getattr(model, method_name) if isinstance(model, get_args(ContextRequiredModelType)): @@ -45,6 +30,8 @@ def _test_prediction_in_sample_full(ts, model, transforms, method_name): # checking forecast_df = forecast_ts.to_pandas(flatten=True) assert not np.any(forecast_df["target"].isna()) + original_target = TSDataset.to_flatten(df)["target"] + assert not forecast_df["target"].equals(original_target) def _test_prediction_in_sample_suffix(ts, model, transforms, method_name, num_skip_points): @@ -57,10 +44,12 @@ def _test_prediction_in_sample_suffix(ts, model, transforms, method_name, num_sk # forecasting forecast_ts = TSDataset(df, freq="D") forecast_ts.transform(ts.transforms) - forecast_ts.df = forecast_ts.df.iloc[(num_skip_points - model.context_size) :] prediction_size = len(forecast_ts.index) - num_skip_points + forecast_ts.df = forecast_ts.df.iloc[(num_skip_points - model.context_size) :] forecast_ts = make_prediction(model=model, ts=forecast_ts, prediction_size=prediction_size, method_name=method_name) # checking forecast_df = forecast_ts.to_pandas(flatten=True) assert not np.any(forecast_df["target"].isna()) + original_target = TSDataset.to_flatten(df.iloc[(num_skip_points - model.context_size) :])["target"] + assert not forecast_df["target"].equals(original_target) diff --git a/tests/test_models/test_inference/test_forecast.py b/tests/test_models/test_inference/test_forecast.py index cb654ef6d..496a869c6 100644 --- a/tests/test_models/test_inference/test_forecast.py +++ b/tests/test_models/test_inference/test_forecast.py @@ -5,6 +5,7 @@ 
import pytest from pandas.util.testing import assert_frame_equal from pytorch_forecasting.data import GroupNormalizer +from pytorch_forecasting.data import NaNLabelEncoder from typing_extensions import get_args from etna.datasets import TSDataset @@ -28,6 +29,7 @@ from etna.models import SimpleExpSmoothingModel from etna.models import TBATSModel from etna.models.nn import DeepARModel +from etna.models.nn import MLPModel from etna.models.nn import RNNModel from etna.models.nn import TFTModel from etna.transforms import LagTransform @@ -35,7 +37,8 @@ from tests.test_models.test_inference.common import _test_prediction_in_sample_full from tests.test_models.test_inference.common import _test_prediction_in_sample_suffix from tests.test_models.test_inference.common import make_prediction -from tests.test_models.test_inference.common import to_be_fixed +from tests.utils import select_segments_subset +from tests.utils import to_be_fixed def make_forecast(model, ts, prediction_size) -> TSDataset: @@ -43,7 +46,7 @@ def make_forecast(model, ts, prediction_size) -> TSDataset: class TestForecastInSampleFullNoTarget: - """Test forecast on full train dataset with filling target with NaNs. + """Test forecast on full train dataset where target is filled with NaNs. Expected that NaNs are filled after prediction. 
""" @@ -83,27 +86,30 @@ def _test_forecast_in_sample_full_no_target(ts, model, transforms): def test_forecast_in_sample_full_no_target(self, model, transforms, example_tsds): self._test_forecast_in_sample_full_no_target(example_tsds, model, transforms) - @to_be_fixed(raises=AssertionError) @pytest.mark.parametrize( "model, transforms", [ - (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), + (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), ], ) - def test_forecast_in_sample_full_no_target_failed(self, model, transforms, example_tsds): - self._test_forecast_in_sample_full_no_target(example_tsds, model, transforms) + def test_forecast_in_sample_full_no_target_failed_nans_sklearn(self, model, transforms, example_tsds): + with pytest.raises(ValueError, match="Input contains NaN, infinity or a value too large"): + self._test_forecast_in_sample_full_no_target(example_tsds, model, transforms) @pytest.mark.parametrize( "model, transforms", [ - (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), - (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), - (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), - (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), ], ) - def test_forecast_in_sample_full_no_target_failed_nans_lags(self, model, transforms, example_tsds): - with pytest.raises(ValueError, match="Input contains NaN, infinity or a value too large"): + def 
test_forecast_in_sample_full_no_target_failed_nans_nn(self, model, transforms, example_tsds): + with pytest.raises(ValueError, match="There are NaNs in features"): self._test_forecast_in_sample_full_no_target(example_tsds, model, transforms) @pytest.mark.parametrize( @@ -113,6 +119,7 @@ def test_forecast_in_sample_full_no_target_failed_nans_lags(self, model, transfo (NaiveModel(lag=3), []), (SeasonalMovingAverageModel(), []), (DeadlineMovingAverageModel(window=1), []), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), ], ) def test_forecast_in_sample_full_no_target_failed_not_enough_context(self, model, transforms, example_tsds): @@ -160,7 +167,7 @@ def test_forecast_in_sample_full_no_target_failed_not_implemented_in_sample(self class TestForecastInSampleFull: """Test forecast on full train dataset. - Expected that target values are filled after prediction. + Expected that there are no NaNs after prediction and targets are changed compared to original. 
""" @pytest.mark.parametrize( @@ -174,7 +181,6 @@ class TestForecastInSampleFull: (HoltModel(), []), (HoltWintersModel(), []), (SimpleExpSmoothingModel(), []), - (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), ], ) def test_forecast_in_sample_full(self, model, transforms, example_tsds): @@ -189,10 +195,23 @@ def test_forecast_in_sample_full(self, model, transforms, example_tsds): (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), ], ) - def test_forecast_in_sample_full_failed_nans_lags(self, model, transforms, example_tsds): + def test_forecast_in_sample_full_failed_nans_sklearn(self, model, transforms, example_tsds): with pytest.raises(ValueError, match="Input contains NaN, infinity or a value too large"): _test_prediction_in_sample_full(example_tsds, model, transforms, method_name="forecast") + @pytest.mark.parametrize( + "model, transforms", + [ + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[2, 3])], + ), + ], + ) + def test_forecast_in_sample_full_failed_nans_nn(self, model, transforms, example_tsds): + with pytest.raises(ValueError, match="There are NaNs in features"): + _test_prediction_in_sample_full(example_tsds, model, transforms, method_name="forecast") + @pytest.mark.parametrize( "model, transforms", [ @@ -200,12 +219,22 @@ def test_forecast_in_sample_full_failed_nans_lags(self, model, transforms, examp (NaiveModel(lag=3), []), (SeasonalMovingAverageModel(), []), (DeadlineMovingAverageModel(window=1), []), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), ], ) def test_forecast_in_sample_full_failed_not_enough_context(self, model, transforms, example_tsds): with pytest.raises(ValueError, match="Given context isn't big enough"): _test_prediction_in_sample_full(example_tsds, model, transforms, method_name="forecast") + 
@to_be_fixed(raises=AssertionError) + # Looks like a problem of current implementation of NNs + @pytest.mark.parametrize( + "model, transforms", + [], + ) + def test_forecast_in_sample_full_failed_nans_lags_nns(self, model, transforms, example_tsds): + _test_prediction_in_sample_full(example_tsds, model, transforms, method_name="forecast") + @to_be_fixed(raises=NotImplementedError, match="It is not possible to make in-sample predictions") @pytest.mark.parametrize( "model, transforms", @@ -245,7 +274,7 @@ def test_forecast_in_sample_full_not_implemented(self, model, transforms, exampl class TestForecastInSampleSuffixNoTarget: - """Test forecast on suffix of train dataset with filling target with NaNs. + """Test forecast on suffix of train dataset where target is filled with NaNs. Expected that NaNs are filled after prediction. """ @@ -290,6 +319,10 @@ def _test_forecast_in_sample_suffix_no_target(ts, model, transforms, num_skip_po (SeasonalMovingAverageModel(), []), (DeadlineMovingAverageModel(window=1), []), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[2, 3])], + ), ], ) def test_forecast_in_sample_suffix_no_target(self, model, transforms, example_tsds): @@ -338,7 +371,7 @@ def test_forecast_in_sample_suffix_no_target_failed_not_implemented_in_sample( class TestForecastInSampleSuffix: """Test forecast on suffix of train dataset. - Expected that target values are filled after prediction. + Expected that there are no NaNs after prediction and targets are changed compared to original. 
""" @pytest.mark.parametrize( @@ -361,6 +394,10 @@ class TestForecastInSampleSuffix: (SeasonalMovingAverageModel(), []), (DeadlineMovingAverageModel(window=1), []), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[2, 3])], + ), ], ) def test_forecast_in_sample_suffix(self, model, transforms, example_tsds): @@ -486,6 +523,10 @@ def _test_forecast_out_sample_prefix(ts, model, transforms, full_prediction_size ], ), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), ], ) def test_forecast_out_sample_prefix(self, model, transforms, example_tsds): @@ -516,7 +557,7 @@ def _test_forecast_out_sample_suffix(ts, model, transforms, full_prediction_size # firstly we should forecast prefix to use it as a context forecast_prefix_ts = deepcopy(forecast_gap_ts) forecast_prefix_ts.df = forecast_prefix_ts.df.iloc[:-suffix_prediction_size] - model.forecast(forecast_prefix_ts, prediction_size=prediction_size_diff) + forecast_prefix_ts = model.forecast(forecast_prefix_ts, prediction_size=prediction_size_diff) forecast_gap_ts.df = forecast_gap_ts.df.combine_first(forecast_prefix_ts.df) # forecast suffix with known context for it @@ -551,20 +592,28 @@ def _test_forecast_out_sample_suffix(ts, model, transforms, full_prediction_size (SeasonalMovingAverageModel(), []), (NaiveModel(lag=3), []), (DeadlineMovingAverageModel(window=1), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), ], ) def test_forecast_out_sample_suffix(self, model, transforms, example_tsds): 
self._test_forecast_out_sample_suffix(example_tsds, model, transforms) - @to_be_fixed(raises=AssertionError) @pytest.mark.parametrize( "model, transforms", [ (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), ], ) - def test_forecast_out_sample_suffix_failed(self, model, transforms, example_tsds): - self._test_forecast_out_sample_suffix(example_tsds, model, transforms) + def test_forecast_out_sample_suffix_failed_rnn(self, model, transforms, example_tsds): + """This test is expected to fail due to autoregression in RNN. + + More about it in issue: https://github.com/tinkoff-ai/etna/issues/1087 + """ + with pytest.raises(AssertionError): + self._test_forecast_out_sample_suffix(example_tsds, model, transforms) @to_be_fixed(raises=NotImplementedError, match="You can only forecast from the next point after the last one") @pytest.mark.parametrize( @@ -605,7 +654,7 @@ def test_forecast_out_sample_suffix_failed_not_implemented(self, model, transfor class TestForecastMixedInOutSample: """Test forecast on mixture of in-sample and out-sample. - Expected that target values are filled after prediction. + Expected that there are no NaNs after prediction and targets are changed compared to original. 
""" @staticmethod @@ -628,6 +677,8 @@ def _test_forecast_mixed_in_out_sample(ts, model, transforms, num_skip_points=50 # checking forecast_full_df = forecast_full_ts.to_pandas(flatten=True) assert not np.any(forecast_full_df["target"].isna()) + original_target = TSDataset.to_flatten(df_full.iloc[(num_skip_points - model.context_size) :])["target"] + assert not forecast_full_df["target"].equals(original_target) @pytest.mark.parametrize( "model, transforms", @@ -649,6 +700,10 @@ def _test_forecast_mixed_in_out_sample(ts, model, transforms, num_skip_points=50 (NaiveModel(lag=3), []), (DeadlineMovingAverageModel(window=1), []), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), ], ) def test_forecast_mixed_in_out_sample(self, model, transforms, example_tsds): @@ -690,3 +745,205 @@ def test_forecast_mixed_in_out_sample(self, model, transforms, example_tsds): ) def test_forecast_mixed_in_out_sample_failed_not_implemented_in_sample(self, model, transforms, example_tsds): self._test_forecast_mixed_in_out_sample(example_tsds, model, transforms) + + +class TestForecastSubsetSegments: + """Test forecast on subset of segments. + + Expected that predictions on subset of segments match subset of predictions on full dataset. 
+ """ + + def _test_forecast_subset_segments(self, ts, model, transforms, segments, prediction_size=5): + # select subset of tsdataset + segments = list(set(segments)) + subset_ts = select_segments_subset(ts=deepcopy(ts), segments=segments) + + # fitting + ts.fit_transform(transforms) + subset_ts.transform(ts.transforms) + model.fit(ts) + + # forecasting full + import torch # TODO: remove after fix at issue-802 + + torch.manual_seed(11) + + forecast_full_ts = ts.make_future(future_steps=prediction_size, tail_steps=model.context_size) + forecast_full_ts = make_forecast(model=model, ts=forecast_full_ts, prediction_size=prediction_size) + + # forecasting subset of segments + torch.manual_seed(11) # TODO: remove after fix at issue-802 + + forecast_subset_ts = subset_ts.make_future(future_steps=prediction_size, tail_steps=model.context_size) + forecast_subset_ts = make_forecast(model=model, ts=forecast_subset_ts, prediction_size=prediction_size) + + # checking + forecast_full_df = forecast_full_ts.to_pandas() + forecast_subset_df = forecast_subset_ts.to_pandas() + assert_frame_equal(forecast_subset_df, forecast_full_df.loc[:, pd.IndexSlice[segments, :]]) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelPerSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (CatBoostModelMultiSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (AutoARIMAModel(), []), + (ProphetModel(), []), + (SARIMAXModel(), []), + (HoltModel(), []), + (HoltWintersModel(), []), + (SimpleExpSmoothingModel(), []), + (MovingAverageModel(window=3), []), + (SeasonalMovingAverageModel(), []), + (NaiveModel(lag=3), []), + 
(DeadlineMovingAverageModel(window=1), []), + (BATSModel(use_trend=True), []), + (TBATSModel(use_trend=True), []), + ( + TFTModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=21, + min_encoder_length=21, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + static_categoricals=["segment"], + target_normalizer=None, + ) + ], + ), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), + ], + ) + def test_forecast_subset_segments(self, model, transforms, example_tsds): + self._test_forecast_subset_segments(example_tsds, model, transforms, segments=["segment_2"]) + + @to_be_fixed(raises=AssertionError) + # issue with explanation: https://github.com/tinkoff-ai/etna/issues/1089 + @pytest.mark.parametrize( + "model, transforms", + [ + ( + DeepARModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=5, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + target_normalizer=GroupNormalizer(groups=["segment"]), + ) + ], + ), + ], + ) + def test_forecast_subset_segments_failed_assertion_error(self, model, transforms, example_tsds): + self._test_forecast_subset_segments(example_tsds, model, transforms, segments=["segment_2"]) + + +class TestForecastNewSegments: + """Test forecast on new segments. + + Expected that target values are filled after prediction. 
+ """ + + def _test_forecast_new_segments(self, ts, model, transforms, train_segments, prediction_size=5): + # create tsdataset with new segments + train_segments = list(set(train_segments)) + forecast_segments = list(set(ts.segments) - set(train_segments)) + train_ts = select_segments_subset(ts=deepcopy(ts), segments=train_segments) + test_ts = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + + # fitting + train_ts.fit_transform(transforms) + test_ts.transform(train_ts.transforms) + model.fit(train_ts) + + # forecasting + import torch # TODO: remove after fix at issue-802 + + torch.manual_seed(11) + + forecast_ts = test_ts.make_future(future_steps=prediction_size, tail_steps=model.context_size) + forecast_ts = make_forecast(model=model, ts=forecast_ts, prediction_size=prediction_size) + + # checking + forecast_df = forecast_ts.to_pandas(flatten=True) + assert not np.any(forecast_df["target"].isna()) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelMultiSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (MovingAverageModel(window=3), []), + (SeasonalMovingAverageModel(), []), + (NaiveModel(lag=3), []), + (DeadlineMovingAverageModel(window=1), []), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), + ( + DeepARModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=5, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + categorical_encoders={"segment": NaNLabelEncoder(add_nan=True, warn=False)}, + 
target_normalizer=GroupNormalizer(groups=["segment"]), + ) + ], + ), + ( + TFTModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=21, + min_encoder_length=21, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + categorical_encoders={"segment": NaNLabelEncoder(add_nan=True, warn=False)}, + static_categoricals=["segment"], + target_normalizer=None, + ) + ], + ), + ], + ) + def test_forecast_new_segments(self, model, transforms, example_tsds): + self._test_forecast_new_segments(example_tsds, model, transforms, train_segments=["segment_1"]) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelPerSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (AutoARIMAModel(), []), + (ProphetModel(), []), + (SARIMAXModel(), []), + (HoltModel(), []), + (HoltWintersModel(), []), + (SimpleExpSmoothingModel(), []), + (BATSModel(use_trend=True), []), + (TBATSModel(use_trend=True), []), + ], + ) + def test_forecast_new_segments_failed_per_segment(self, model, transforms, example_tsds): + with pytest.raises(NotImplementedError, match="Per-segment models can't make predictions on new segments"): + self._test_forecast_new_segments(example_tsds, model, transforms, train_segments=["segment_1"]) diff --git a/tests/test_models/test_inference/test_predict.py b/tests/test_models/test_inference/test_predict.py index 1d89d36ef..5fb0eaf7b 100644 --- a/tests/test_models/test_inference/test_predict.py +++ b/tests/test_models/test_inference/test_predict.py @@ -1,8 +1,11 @@ +from copy import deepcopy + import numpy as np import pandas as pd import pytest from pandas.util.testing import assert_frame_equal from pytorch_forecasting.data import GroupNormalizer +from pytorch_forecasting.data import NaNLabelEncoder from 
etna.datasets import TSDataset from etna.models import AutoARIMAModel @@ -24,6 +27,7 @@ from etna.models import SimpleExpSmoothingModel from etna.models import TBATSModel from etna.models.nn import DeepARModel +from etna.models.nn import MLPModel from etna.models.nn import RNNModel from etna.models.nn import TFTModel from etna.transforms import LagTransform @@ -31,7 +35,8 @@ from tests.test_models.test_inference.common import _test_prediction_in_sample_full from tests.test_models.test_inference.common import _test_prediction_in_sample_suffix from tests.test_models.test_inference.common import make_prediction -from tests.test_models.test_inference.common import to_be_fixed +from tests.utils import select_segments_subset +from tests.utils import to_be_fixed def make_predict(model, ts, prediction_size) -> TSDataset: @@ -41,7 +46,7 @@ def make_predict(model, ts, prediction_size) -> TSDataset: class TestPredictInSampleFull: """Test predict on full train dataset. - Expected that target values are filled after prediction. + Expected that there are no NaNs after prediction and targets are changed compared to original. 
""" @pytest.mark.parametrize( @@ -69,7 +74,7 @@ def test_predict_in_sample_full(self, model, transforms, example_tsds): (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[2, 3])]), ], ) - def test_predict_in_sample_full_failed_not_enough_context(self, model, transforms, example_tsds): + def test_predict_in_sample_full_failed_nans_sklearn(self, model, transforms, example_tsds): with pytest.raises(ValueError, match="Input contains NaN, infinity or a value too large"): _test_prediction_in_sample_full(example_tsds, model, transforms, method_name="predict") @@ -119,6 +124,10 @@ def test_predict_in_sample_full_failed_not_enough_context(self, model, transform ], ), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[2, 3])], + ), ], ) def test_predict_in_sample_full_failed_not_implemented_predict(self, model, transforms, example_tsds): @@ -136,7 +145,7 @@ def test_predict_in_sample_full_failed_not_implemented_in_sample(self, model, tr class TestPredictInSampleSuffix: """Test predict on suffix of train dataset. - Expected that target values are filled after prediction. + Expected that there are no NaNs after prediction and targets are changed compared to original. 
""" @pytest.mark.parametrize( @@ -196,6 +205,10 @@ def test_predict_in_sample_suffix(self, model, transforms, example_tsds): ], ), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[2, 3])], + ), ], ) def test_predict_in_sample_full_failed_not_implemented_predict(self, model, transforms, example_tsds): @@ -213,13 +226,14 @@ def test_predict_in_sample_suffix_failed_not_implemented_in_sample(self, model, class TestPredictOutSample: """Test predict on future dataset. - Expected that target values are filled after prediction. + Expected that there are no NaNs after prediction and targets are changed compared to original. """ @staticmethod def _test_predict_out_sample(ts, model, transforms, prediction_size=5): - train_ts, future_ts = ts.train_test_split(test_size=prediction_size) + train_ts, _ = ts.train_test_split(test_size=prediction_size) forecast_ts = TSDataset(df=ts.df, freq=ts.freq) + df = forecast_ts.to_pandas() # fitting train_ts.fit_transform(transforms) @@ -234,6 +248,8 @@ def _test_predict_out_sample(ts, model, transforms, prediction_size=5): # checking forecast_df = forecast_ts.to_pandas(flatten=True) assert not np.any(forecast_df["target"].isna()) + original_target = TSDataset.to_flatten(df.iloc[-to_remain:])["target"] + assert not forecast_df["target"].equals(original_target) @pytest.mark.parametrize( "model, transforms", @@ -292,12 +308,217 @@ def test_predict_out_sample(self, model, transforms, example_tsds): ], ), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), ], ) def test_predict_out_sample_failed_not_implemented_predict(self, model, transforms, 
example_tsds): self._test_predict_out_sample(example_tsds, model, transforms) +class TestPredictOutSamplePrefix: + """Test predict on prefix of future dataset. + + Expected that predictions on prefix match prefix of predictions on full future dataset. + """ + + @staticmethod + def _test_predict_out_sample_prefix(ts, model, transforms, full_prediction_size=5, prefix_prediction_size=3): + prediction_size_diff = full_prediction_size - prefix_prediction_size + train_ts, _ = ts.train_test_split(test_size=full_prediction_size) + forecast_full_ts = TSDataset(df=ts.df, freq=ts.freq) + forecast_prefix_ts = TSDataset(df=ts.df, freq=ts.freq) + + # fitting + train_ts.fit_transform(transforms) + model.fit(train_ts) + + # forecasting full + forecast_full_ts.transform(train_ts.transforms) + to_remain = model.context_size + full_prediction_size + forecast_full_ts.df = forecast_full_ts.df.iloc[-to_remain:] + forecast_full_ts = make_predict(model=model, ts=forecast_full_ts, prediction_size=full_prediction_size) + + # forecasting only prefix + forecast_prefix_ts.transform(train_ts.transforms) + forecast_prefix_ts.df = forecast_prefix_ts.df.iloc[-to_remain:-prediction_size_diff] + forecast_prefix_ts = make_predict(model=model, ts=forecast_prefix_ts, prediction_size=prefix_prediction_size) + + # checking + forecast_full_df = forecast_full_ts.to_pandas() + forecast_prefix_df = forecast_prefix_ts.to_pandas() + assert_frame_equal(forecast_prefix_df, forecast_full_df.iloc[:prefix_prediction_size]) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelPerSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (CatBoostModelMultiSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticMultiSegmentModel(), 
[LagTransform(in_column="target", lags=[5, 6])]), + (AutoARIMAModel(), []), + (ProphetModel(), []), + (SARIMAXModel(), []), + (HoltModel(), []), + (HoltWintersModel(), []), + (SimpleExpSmoothingModel(), []), + (MovingAverageModel(window=3), []), + (SeasonalMovingAverageModel(), []), + (NaiveModel(lag=3), []), + (DeadlineMovingAverageModel(window=1), []), + ], + ) + def test_predict_out_sample_prefix(self, model, transforms, example_tsds): + self._test_predict_out_sample_prefix(example_tsds, model, transforms) + + @to_be_fixed(raises=NotImplementedError, match="Method predict isn't currently implemented") + @pytest.mark.parametrize( + "model, transforms", + [ + (BATSModel(use_trend=True), []), + (TBATSModel(use_trend=True), []), + ( + DeepARModel(max_epochs=5, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=5, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + target_normalizer=GroupNormalizer(groups=["segment"]), + ) + ], + ), + ( + TFTModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=21, + min_encoder_length=21, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + static_categoricals=["segment"], + target_normalizer=None, + ) + ], + ), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), + ], + ) + def test_predict_out_sample_prefix_failed_not_implemented_predict(self, model, transforms, example_tsds): + self._test_predict_out_sample_prefix(example_tsds, model, transforms) + + +class TestPredictOutSampleSuffix: + """Test predict on suffix of future dataset. + + Expected that predictions on suffix match suffix of predictions on full future dataset. 
+ """ + + @staticmethod + def _test_predict_out_sample_suffix(ts, model, transforms, full_prediction_size=5, suffix_prediction_size=3): + prediction_size_diff = full_prediction_size - suffix_prediction_size + train_ts, _ = ts.train_test_split(test_size=full_prediction_size) + forecast_full_ts = TSDataset(df=ts.df, freq=ts.freq) + forecast_suffix_ts = TSDataset(df=ts.df, freq=ts.freq) + + # fitting + train_ts.fit_transform(transforms) + model.fit(train_ts) + + # forecasting full + forecast_full_ts.transform(train_ts.transforms) + to_remain = model.context_size + full_prediction_size + forecast_full_ts.df = forecast_full_ts.df.iloc[-to_remain:] + forecast_full_ts = make_predict(model=model, ts=forecast_full_ts, prediction_size=full_prediction_size) + + # forecasting only suffix + forecast_suffix_ts.transform(train_ts.transforms) + to_remain = model.context_size + suffix_prediction_size + forecast_suffix_ts.df = forecast_suffix_ts.df.iloc[-to_remain:] + forecast_suffix_ts = make_predict(model=model, ts=forecast_suffix_ts, prediction_size=suffix_prediction_size) + + # checking + forecast_full_df = forecast_full_ts.to_pandas() + forecast_suffix_df = forecast_suffix_ts.to_pandas() + assert_frame_equal(forecast_suffix_df, forecast_full_df.iloc[prediction_size_diff:]) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelPerSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (CatBoostModelMultiSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (AutoARIMAModel(), []), + (ProphetModel(), []), + (SARIMAXModel(), []), + (HoltModel(), []), + (HoltWintersModel(), []), + (SimpleExpSmoothingModel(), []), + 
(MovingAverageModel(window=3), []), + (SeasonalMovingAverageModel(), []), + (NaiveModel(lag=3), []), + (DeadlineMovingAverageModel(window=1), []), + ], + ) + def test_predict_out_sample_suffix(self, model, transforms, example_tsds): + self._test_predict_out_sample_suffix(example_tsds, model, transforms) + + @to_be_fixed(raises=NotImplementedError, match="Method predict isn't currently implemented") + @pytest.mark.parametrize( + "model, transforms", + [ + (BATSModel(use_trend=True), []), + (TBATSModel(use_trend=True), []), + ( + DeepARModel(max_epochs=5, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=5, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + target_normalizer=GroupNormalizer(groups=["segment"]), + ) + ], + ), + ( + TFTModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=21, + min_encoder_length=21, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + static_categoricals=["segment"], + target_normalizer=None, + ) + ], + ), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), + ], + ) + def test_predict_out_sample_suffix_failed_not_implemented_predict(self, model, transforms, example_tsds): + self._test_predict_out_sample_suffix(example_tsds, model, transforms) + + class TestPredictMixedInOutSample: """Test predict on mixture of in-sample and out-sample. 
@@ -403,7 +624,227 @@ def test_predict_mixed_in_out_sample(self, model, transforms, example_tsds): ], ), (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), ], ) def test_predict_mixed_in_out_sample_failed_not_implemented_predict(self, model, transforms, example_tsds): self._test_predict_mixed_in_out_sample(example_tsds, model, transforms) + + +class TestPredictSubsetSegments: + """Test predict on subset of segments on suffix of train dataset. + + Expected that predictions on subset of segments match subset of predictions on full dataset. + """ + + def _test_predict_subset_segments(self, ts, model, transforms, segments, num_skip_points=50): + prediction_size = len(ts.index) - num_skip_points + + # select subset of tsdataset + segments = list(set(segments)) + subset_ts = select_segments_subset(ts=deepcopy(ts), segments=segments) + + # fitting + ts.fit_transform(transforms) + subset_ts.transform(ts.transforms) + model.fit(ts) + + # forecasting full + import torch # TODO: remove after fix at issue-802 + + torch.manual_seed(11) + + ts.df = ts.df.iloc[(num_skip_points - model.context_size) :] + forecast_full_ts = make_predict(model=model, ts=ts, prediction_size=prediction_size) + + # forecasting subset of segments + torch.manual_seed(11) # TODO: remove after fix at issue-802 + + subset_ts.df = subset_ts.df.iloc[(num_skip_points - model.context_size) :] + forecast_subset_ts = make_predict(model=model, ts=subset_ts, prediction_size=prediction_size) + + # checking + forecast_full_df = forecast_full_ts.to_pandas() + forecast_subset_df = forecast_subset_ts.to_pandas() + assert_frame_equal(forecast_subset_df, forecast_full_df.loc[:, pd.IndexSlice[segments, :]]) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelPerSegment(), 
[LagTransform(in_column="target", lags=[5, 6])]), + (CatBoostModelMultiSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (AutoARIMAModel(), []), + (ProphetModel(), []), + (SARIMAXModel(), []), + (HoltModel(), []), + (HoltWintersModel(), []), + (SimpleExpSmoothingModel(), []), + (MovingAverageModel(window=3), []), + (SeasonalMovingAverageModel(), []), + (NaiveModel(lag=3), []), + (DeadlineMovingAverageModel(window=1), []), + ], + ) + def test_predict_subset_segments(self, model, transforms, example_tsds): + self._test_predict_subset_segments(example_tsds, model, transforms, segments=["segment_2"]) + + @to_be_fixed(raises=NotImplementedError, match="Method predict isn't currently implemented") + @pytest.mark.parametrize( + "model, transforms", + [ + (BATSModel(use_trend=True), []), + (TBATSModel(use_trend=True), []), + ( + DeepARModel(max_epochs=5, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=5, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + target_normalizer=GroupNormalizer(groups=["segment"]), + ) + ], + ), + ( + TFTModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=21, + min_encoder_length=21, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + static_categoricals=["segment"], + target_normalizer=None, + ) + ], + ), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + 
[LagTransform(in_column="target", lags=[5, 6])], + ), + ], + ) + def test_predict_subset_segments_failed_not_implemented_predict(self, model, transforms, example_tsds): + self._test_predict_subset_segments(example_tsds, model, transforms, segments=["segment_2"]) + + +class TestPredictNewSegments: + """Test predict on new segments on suffix of train dataset. + + Expected that there are no NaNs after prediction and targets are changed compared to original. + """ + + def _test_predict_new_segments(self, ts, model, transforms, train_segments, num_skip_points=50): + # create tsdataset with new segments + train_segments = list(set(train_segments)) + forecast_segments = list(set(ts.segments) - set(train_segments)) + train_ts = select_segments_subset(ts=deepcopy(ts), segments=train_segments) + test_ts = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + df = test_ts.to_pandas() + + # fitting + train_ts.fit_transform(transforms) + test_ts.transform(train_ts.transforms) + model.fit(train_ts) + + # forecasting + import torch # TODO: remove after fix at issue-802 + + torch.manual_seed(11) + + test_ts.df = test_ts.df.iloc[(num_skip_points - model.context_size) :] + prediction_size = len(ts.index) - num_skip_points + forecast_ts = make_predict(model=model, ts=test_ts, prediction_size=prediction_size) + + # checking + forecast_df = forecast_ts.to_pandas(flatten=True) + assert not np.any(forecast_df["target"].isna()) + original_target = TSDataset.to_flatten(df.iloc[(num_skip_points - model.context_size) :])["target"] + assert not forecast_df["target"].equals(original_target) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelMultiSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticMultiSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (MovingAverageModel(window=3), []), + (SeasonalMovingAverageModel(), []), + 
(NaiveModel(lag=3), []), + (DeadlineMovingAverageModel(window=1), []), + ], + ) + def test_predict_new_segments(self, model, transforms, example_tsds): + self._test_predict_new_segments(example_tsds, model, transforms, train_segments=["segment_1"]) + + @to_be_fixed(raises=NotImplementedError, match="Method predict isn't currently implemented") + @pytest.mark.parametrize( + "model, transforms", + [ + ( + DeepARModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=5, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + categorical_encoders={"segment": NaNLabelEncoder(add_nan=True, warn=False)}, + target_normalizer=GroupNormalizer(groups=["segment"]), + ) + ], + ), + ( + TFTModel(max_epochs=1, learning_rate=[0.01]), + [ + PytorchForecastingTransform( + max_encoder_length=21, + min_encoder_length=21, + max_prediction_length=5, + time_varying_known_reals=["time_idx"], + time_varying_unknown_reals=["target"], + categorical_encoders={"segment": NaNLabelEncoder(add_nan=True, warn=False)}, + static_categoricals=["segment"], + target_normalizer=None, + ) + ], + ), + (RNNModel(input_size=1, encoder_length=7, decoder_length=7, trainer_params=dict(max_epochs=1)), []), + ( + MLPModel(input_size=2, hidden_size=[10], decoder_length=7, trainer_params=dict(max_epochs=1)), + [LagTransform(in_column="target", lags=[5, 6])], + ), + ], + ) + def test_predict_new_segments_failed_not_implemented_predict(self, model, transforms, example_tsds): + self._test_predict_new_segments(example_tsds, model, transforms, train_segments=["segment_1"]) + + @pytest.mark.parametrize( + "model, transforms", + [ + (CatBoostModelPerSegment(), [LagTransform(in_column="target", lags=[5, 6])]), + (LinearPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (ElasticPerSegmentModel(), [LagTransform(in_column="target", lags=[5, 6])]), + (AutoARIMAModel(), []), + (ProphetModel(), []), + 
(SARIMAXModel(), []), + (HoltModel(), []), + (HoltWintersModel(), []), + (SimpleExpSmoothingModel(), []), + (BATSModel(use_trend=True), []), + (TBATSModel(use_trend=True), []), + ], + ) + def test_predict_new_segments_failed_per_segment(self, model, transforms, example_tsds): + with pytest.raises(NotImplementedError, match="Per-segment models can't make predictions on new segments"): + self._test_predict_new_segments(example_tsds, model, transforms, train_segments=["segment_1"]) diff --git a/tests/test_models/test_simple_models.py b/tests/test_models/test_simple_models.py index 37365d455..3d30231de 100644 --- a/tests/test_models/test_simple_models.py +++ b/tests/test_models/test_simple_models.py @@ -3,14 +3,13 @@ import pytest from etna.datasets import TSDataset +from etna.datasets import generate_ar_df from etna.metrics import MAE from etna.models.deadline_ma import DeadlineMovingAverageModel from etna.models.deadline_ma import SeasonalityMode -from etna.models.deadline_ma import _DeadlineMovingAverageModel from etna.models.moving_average import MovingAverageModel from etna.models.naive import NaiveModel from etna.models.seasonal_ma import SeasonalMovingAverageModel -from etna.models.seasonal_ma import _SeasonalMovingAverageModel from etna.pipeline import Pipeline from tests.test_models.utils import assert_model_equals_loaded_original @@ -124,9 +123,10 @@ def test_sma_model_predict_fail_nans_in_context(simple_df): ], ) def test_deadline_get_context_beginning_ok(freq, periods, start, prediction_size, seasonality, window, expected): - df = pd.DataFrame({"timestamp": pd.date_range(start=start, periods=periods, freq=freq)}) + timestamp = pd.date_range(start=start, periods=periods, freq=freq) + df = pd.DataFrame({"target": 1}, index=timestamp) - obtained = _DeadlineMovingAverageModel._get_context_beginning(df, prediction_size, seasonality, window) + obtained = DeadlineMovingAverageModel._get_context_beginning(df, prediction_size, seasonality, window) assert obtained == 
expected @@ -151,10 +151,11 @@ def test_deadline_get_context_beginning_ok(freq, periods, start, prediction_size def test_deadline_get_context_beginning_fail_not_enough_context( freq, periods, start, prediction_size, seasonality, window ): - df = pd.DataFrame({"timestamp": pd.date_range(start=start, periods=periods, freq=freq)}) + timestamp = pd.date_range(start=start, periods=periods, freq=freq) + df = pd.DataFrame({"target": 1}, index=timestamp) with pytest.raises(ValueError, match="Given context isn't big enough"): - _ = _DeadlineMovingAverageModel._get_context_beginning(df, prediction_size, seasonality, window) + _ = DeadlineMovingAverageModel._get_context_beginning(df, prediction_size, seasonality, window) @pytest.mark.parametrize("model", [DeadlineMovingAverageModel]) @@ -167,6 +168,14 @@ def test_deadline_model_predict(simple_df, model): _check_predict(ts=simple_df, model=model(window=1), prediction_size=7) +def test_deadline_model_fit_fail_not_supported_freq(): + df = generate_ar_df(start_time="2020-01-01", periods=100, freq="2D") + ts = TSDataset(df=TSDataset.to_dataset(df), freq="2D") + model = DeadlineMovingAverageModel(window=1000) + with pytest.raises(ValueError, match="Freq 2D is not supported"): + model.fit(ts) + + def test_deadline_model_forecast_fail_not_enough_context(simple_df): model = DeadlineMovingAverageModel(window=1000) model.fit(simple_df) @@ -179,7 +188,7 @@ def test_deadline_model_predict_fail_not_enough_context(simple_df): model = DeadlineMovingAverageModel(window=1000) model.fit(simple_df) with pytest.raises(ValueError, match="Given context isn't big enough"): - _ = model.forecast(simple_df, prediction_size=7) + _ = model.predict(simple_df, prediction_size=7) def test_deadline_model_forecast_fail_nans_in_context(simple_df): @@ -199,6 +208,25 @@ def test_deadline_model_predict_fail_nans_in_context(simple_df): _ = model.predict(simple_df, prediction_size=7) +def test_deadline_model_context_size_fail_not_fitted(simple_df): + model = 
DeadlineMovingAverageModel(window=1000) + with pytest.raises(ValueError, match="Model is not fitted"): + _ = model.context_size + + +def test_deadline_model_forecast_fail_not_fitted(simple_df): + model = DeadlineMovingAverageModel(window=1000) + future_ts = simple_df.make_future(future_steps=7, tail_steps=100) + with pytest.raises(ValueError, match="Model is not fitted"): + _ = model.forecast(future_ts, prediction_size=7) + + +def test_deadline_model_predict_fail_not_fitted(simple_df): + model = DeadlineMovingAverageModel(window=1000) + with pytest.raises(ValueError, match="Model is not fitted"): + _ = model.predict(simple_df, prediction_size=7) + + def test_seasonal_moving_average_forecast_correct(simple_df): model = SeasonalMovingAverageModel(window=3, seasonality=7) model.fit(simple_df) @@ -207,19 +235,20 @@ def test_seasonal_moving_average_forecast_correct(simple_df): res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] df1 = pd.DataFrame() - df1["target"] = np.arange(35, 42) + df1["target"] = np.arange(35, 42, dtype=float) df1["segment"] = "A" df1["timestamp"] = pd.date_range(start="2020-02-19", periods=7) df2 = pd.DataFrame() - df2["target"] = [0, 2, 4, 6, 8, 10, 12] + df2["target"] = [0.0, 2, 4, 6, 8, 10, 12] df2["segment"] = "B" df2["timestamp"] = pd.date_range(start="2020-02-19", periods=7) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) - assert np.all(res.values == answer.values) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + + pd.testing.assert_frame_equal(res, answer) def test_naive_forecast_correct(simple_df): @@ -230,20 +259,20 @@ def test_naive_forecast_correct(simple_df): res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] df1 = pd.DataFrame() - df1["target"] = [46, 47, 48] * 2 + 
[46] + df1["target"] = [46.0, 47, 48] * 2 + [46] df1["segment"] = "A" df1["timestamp"] = pd.date_range(start="2020-02-19", periods=7) df2 = pd.DataFrame() - df2["target"] = [8, 10, 12] * 2 + [8] + df2["target"] = [8.0, 10, 12] * 2 + [8] df2["segment"] = "B" df2["timestamp"] = pd.date_range(start="2020-02-19", periods=7) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) - assert np.all(res.values == answer.values) + pd.testing.assert_frame_equal(res, answer) def test_moving_average_forecast_correct(simple_df): @@ -254,7 +283,7 @@ def test_moving_average_forecast_correct(simple_df): res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] df1 = pd.DataFrame() - tmp = np.arange(44, 49) + tmp = np.arange(44, 49, dtype=float) for i in range(7): tmp = np.append(tmp, [tmp[-5:].mean()]) df1["target"] = tmp[-7:] @@ -262,7 +291,7 @@ def test_moving_average_forecast_correct(simple_df): df1["timestamp"] = pd.date_range(start="2020-02-19", periods=7) df2 = pd.DataFrame() - tmp = np.arange(0, 13, 2) + tmp = np.arange(0, 13, 2, dtype=float) for i in range(7): tmp = np.append(tmp, [tmp[-5:].mean()]) df2["target"] = tmp[-7:] @@ -270,10 +299,10 @@ def test_moving_average_forecast_correct(simple_df): df2["timestamp"] = pd.date_range(start="2020-02-19", periods=7) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) - assert np.all(res.values == answer.values) + pd.testing.assert_frame_equal(res, answer) def 
test_deadline_moving_average_forecast_correct(df): @@ -340,9 +369,10 @@ def test_deadline_moving_average_forecast_correct(df): df2["timestamp"] = pd.date_range(start="2020-05-20", periods=20) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) - assert np.all(res.values == answer.values) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + + pd.testing.assert_frame_equal(res, answer) def test_seasonal_moving_average_predict_correct(simple_df): @@ -352,19 +382,20 @@ def test_seasonal_moving_average_predict_correct(simple_df): res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] df1 = pd.DataFrame() - df1["target"] = np.arange(39, 46) + df1["target"] = np.arange(39, 46, dtype=float) df1["segment"] = "A" df1["timestamp"] = pd.date_range(start="2020-02-12", periods=7) df2 = pd.DataFrame() - df2["target"] = [8, 10, 5, 7, 2, 4, 6] + df2["target"] = [8.0, 10, 5, 7, 2, 4, 6] df2["segment"] = "B" df2["timestamp"] = pd.date_range(start="2020-02-12", periods=7) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) - assert np.all(res.values == answer.values) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + + pd.testing.assert_frame_equal(res, answer) def test_naive_predict_correct(simple_df): @@ -374,20 +405,20 @@ def test_naive_predict_correct(simple_df): res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] df1 = pd.DataFrame() - df1["target"] = np.arange(39, 46) + df1["target"] = np.arange(39, 46, dtype=float) df1["segment"] = "A" df1["timestamp"] = pd.date_range(start="2020-02-12", periods=7) df2 
= pd.DataFrame() - df2["target"] = [8, 10, 12, 0, 2, 4, 6] + df2["target"] = [8.0, 10, 12, 0, 2, 4, 6] df2["segment"] = "B" df2["timestamp"] = pd.date_range(start="2020-02-12", periods=7) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) - assert np.all(res.values == answer.values) + pd.testing.assert_frame_equal(res, answer) def test_moving_average_predict_correct(simple_df): @@ -397,7 +428,7 @@ def test_moving_average_predict_correct(simple_df): res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] df1 = pd.DataFrame() - df1["target"] = np.arange(39, 46) + df1["target"] = np.arange(39, 46, dtype=float) df1["segment"] = "A" df1["timestamp"] = pd.date_range(start="2020-02-12", periods=7) @@ -407,10 +438,10 @@ def test_moving_average_predict_correct(simple_df): df2["timestamp"] = pd.date_range(start="2020-02-12", periods=7) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) + res = res.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) - assert np.all(res.values == answer.values) + pd.testing.assert_frame_equal(res, answer) def test_deadline_moving_average_predict_correct(df): @@ -476,9 +507,10 @@ def test_deadline_moving_average_predict_correct(df): df2["timestamp"] = pd.date_range(start="2020-04-30", periods=20) answer = pd.concat([df2, df1], axis=0, ignore_index=True) - res = res.sort_values(by=["segment", "timestamp"]) - answer = answer.sort_values(by=["segment", "timestamp"]) - assert np.all(res.values == answer.values) + res = res.sort_values(by=["segment", 
"timestamp"]).reset_index(drop=True) + answer = answer.sort_values(by=["segment", "timestamp"]).reset_index(drop=True) + + pd.testing.assert_frame_equal(res, answer) @pytest.mark.parametrize( @@ -530,32 +562,18 @@ def test_context_size_deadline_ma(model, freq, expected_context_size): @pytest.mark.parametrize( "etna_model_class", - (SeasonalMovingAverageModel, MovingAverageModel, NaiveModel, DeadlineMovingAverageModel), -) -def test_get_model_before_training(etna_model_class): - """Check that get_model method throws an error if per-segment model is not fitted yet.""" - etna_model = etna_model_class() - with pytest.raises(ValueError, match="Can not get the dict with base models, the model is not fitted!"): - _ = etna_model.get_model() - - -@pytest.mark.parametrize( - "etna_model_class,expected_class", ( - (NaiveModel, _SeasonalMovingAverageModel), - (SeasonalMovingAverageModel, _SeasonalMovingAverageModel), - (MovingAverageModel, _SeasonalMovingAverageModel), - (DeadlineMovingAverageModel, _DeadlineMovingAverageModel), + NaiveModel, + SeasonalMovingAverageModel, + MovingAverageModel, + DeadlineMovingAverageModel, ), ) -def test_get_model_after_training(example_tsds, etna_model_class, expected_class): - """Check that get_model method returns dict of objects of _SeasonalMovingAverageModel class.""" +def test_get_model(example_tsds, etna_model_class): pipeline = Pipeline(model=etna_model_class()) pipeline.fit(ts=example_tsds) - models_dict = pipeline.model.get_model() - assert isinstance(models_dict, dict) - for segment in example_tsds.segments: - assert isinstance(models_dict[segment], expected_class) + model = pipeline.model.get_model() + assert isinstance(model, etna_model_class) @pytest.fixture diff --git a/tests/test_pipeline/conftest.py b/tests/test_pipeline/conftest.py index e3dbb3464..eb88f93a4 100644 --- a/tests/test_pipeline/conftest.py +++ b/tests/test_pipeline/conftest.py @@ -8,6 +8,7 @@ from etna.datasets import TSDataset from etna.models import 
CatBoostPerSegmentModel +from etna.models import NaiveModel from etna.pipeline import Pipeline from etna.transforms import LagTransform @@ -25,6 +26,16 @@ def catboost_pipeline() -> Pipeline: return pipeline +@pytest.fixture +def naive_pipeline() -> Pipeline: + """Generate pipeline with NaiveModel.""" + pipeline = Pipeline( + model=NaiveModel(lag=7), + horizon=7, + ) + return pipeline + + @pytest.fixture def catboost_pipeline_big() -> Pipeline: """Generate pipeline with CatBoostPerSegmentModel.""" @@ -218,7 +229,7 @@ def masked_ts() -> TSDataset: @pytest.fixture -def ts_run_fold() -> TSDataset: +def ts_process_fold_forecast() -> TSDataset: timerange = pd.date_range(start="2020-01-01", periods=11).to_list() df = pd.DataFrame({"timestamp": timerange + timerange}) df["segment"] = ["segment_0"] * 11 + ["segment_1"] * 11 diff --git a/tests/test_pipeline/test_autoregressive_pipeline.py b/tests/test_pipeline/test_autoregressive_pipeline.py index 939d3bad4..7c9b77006 100644 --- a/tests/test_pipeline/test_autoregressive_pipeline.py +++ b/tests/test_pipeline/test_autoregressive_pipeline.py @@ -27,6 +27,8 @@ from etna.transforms import LagTransform from etna.transforms import LinearTrendTransform from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts_with_prediction_intervals DEFAULT_METRICS = [MAE(mode=MetricAggregationMode.per_segment)] @@ -75,7 +77,7 @@ def test_private_forecast_context_ignorant_model(model_class, example_tsds): with patch.object(TSDataset, "make_future", make_future): pipeline = AutoRegressivePipeline(model=model, horizon=5, step=1) pipeline.fit(example_tsds) - _ = pipeline._forecast() + _ = pipeline._forecast(ts=example_tsds) assert make_future.mock.call_count == 5 make_future.mock.assert_called_with(future_steps=pipeline.step) @@ -97,7 +99,7 @@ def 
test_private_forecast_context_required_model(model_class, example_tsds): with patch.object(TSDataset, "make_future", make_future): pipeline = AutoRegressivePipeline(model=model, horizon=5, step=1) pipeline.fit(example_tsds) - _ = pipeline._forecast() + _ = pipeline._forecast(ts=example_tsds) assert make_future.mock.call_count == 5 make_future.mock.assert_called_with(future_steps=pipeline.step, tail_steps=model.context_size) @@ -197,10 +199,10 @@ def test_forecast_with_fit_transforms(example_tsds): pipeline.forecast() -def test_forecast_raise_error_if_not_fitted(): - """Test that AutoRegressivePipeline raise error when calling forecast without being fit.""" +def test_forecast_raise_error_if_no_ts(): + """Test that AutoRegressivePipeline raises error when calling forecast without ts.""" pipeline = AutoRegressivePipeline(model=LinearPerSegmentModel(), horizon=5) - with pytest.raises(ValueError, match="AutoRegressivePipeline is not fitted!"): + with pytest.raises(ValueError, match="There is no ts to forecast!"): _ = pipeline.forecast() @@ -275,6 +277,7 @@ def test_predict(model, transforms, example_tsds): assert len(result_df) == len(example_tsds.segments) * num_points +@pytest.mark.parametrize("load_ts", [True, False]) @pytest.mark.parametrize( "model, transforms", [ @@ -291,7 +294,51 @@ def test_predict(model, transforms, example_tsds): (ProphetModel(), []), ], ) -def test_save_load(model, transforms, example_tsds): +def test_save_load(load_ts, model, transforms, example_tsds): horizon = 3 pipeline = AutoRegressivePipeline(model=model, transforms=transforms, horizon=horizon, step=1) - assert_pipeline_equals_loaded_original(pipeline=pipeline, ts=example_tsds) + assert_pipeline_equals_loaded_original(pipeline=pipeline, ts=example_tsds, load_ts=load_ts) + + +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + ( + 
LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + (SeasonalMovingAverageModel(window=2, seasonality=7), []), + (SARIMAXModel(), []), + (ProphetModel(), []), + ], +) +def test_forecast_given_ts(model, transforms, example_tsds): + horizon = 3 + pipeline = AutoRegressivePipeline(model=model, transforms=transforms, horizon=horizon) + assert_pipeline_forecasts_given_ts(pipeline=pipeline, ts=example_tsds, horizon=horizon) + + +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + ( + LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + (SeasonalMovingAverageModel(window=2, seasonality=7), []), + (SARIMAXModel(), []), + (ProphetModel(), []), + ], +) +def test_forecast_given_ts_with_prediction_interval(model, transforms, example_tsds): + horizon = 3 + pipeline = AutoRegressivePipeline(model=model, transforms=transforms, horizon=horizon) + assert_pipeline_forecasts_given_ts_with_prediction_intervals(pipeline=pipeline, ts=example_tsds, horizon=horizon) diff --git a/tests/test_pipeline/test_hierarchical_pipeline.py b/tests/test_pipeline/test_hierarchical_pipeline.py index 8424a29a1..c7622b642 100644 --- a/tests/test_pipeline/test_hierarchical_pipeline.py +++ b/tests/test_pipeline/test_hierarchical_pipeline.py @@ -1,20 +1,29 @@ +import pathlib from unittest.mock import Mock +from unittest.mock import patch import numpy as np +import pandas as pd import pytest from etna.datasets.utils import match_target_quantiles from etna.metrics import MAE from etna.metrics import Coverage from etna.metrics import Width +from etna.models import CatBoostMultiSegmentModel from etna.models import LinearPerSegmentModel from etna.models import NaiveModel +from etna.models import ProphetModel from 
etna.pipeline.hierarchical_pipeline import HierarchicalPipeline from etna.reconciliation import BottomUpReconciliator from etna.reconciliation import TopDownReconciliator +from etna.transforms import DateFlagsTransform from etna.transforms import LagTransform from etna.transforms import LinearTrendTransform from etna.transforms import MeanTransform +from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts_with_prediction_intervals @pytest.mark.parametrize( @@ -79,7 +88,7 @@ def test_raw_forecast_correctness(market_level_constant_hierarchical_ts, reconci model = NaiveModel() pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=[], horizon=1) pipeline.fit(ts=market_level_constant_hierarchical_ts) - forecast = pipeline.raw_forecast() + forecast = pipeline.raw_forecast(ts=market_level_constant_hierarchical_ts) np.testing.assert_array_almost_equal(forecast[..., "target"].values, answer) @@ -94,7 +103,7 @@ def test_raw_forecast_level(market_level_simple_hierarchical_ts, reconciliator): model = NaiveModel() pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=[], horizon=1) pipeline.fit(ts=market_level_simple_hierarchical_ts) - forecast = pipeline.raw_forecast() + forecast = pipeline.raw_forecast(ts=market_level_simple_hierarchical_ts) assert forecast.current_df_level == pipeline.reconciliator.source_level @@ -260,3 +269,156 @@ def test_interval_metrics(product_level_constant_hierarchical_ts, metric_type, r forecast_params={"prediction_interval": True, "n_folds": 1}, ) np.testing.assert_array_almost_equal(results[metric.name], answer) + + +@patch("etna.pipeline.pipeline.Pipeline.save") +def test_save(save_mock, product_level_constant_hierarchical_ts, tmp_path): + ts = product_level_constant_hierarchical_ts + model = NaiveModel() + reconciliator 
= BottomUpReconciliator(target_level="market", source_level="product") + pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=[], horizon=1) + dir_path = pathlib.Path(tmp_path) + path = dir_path / "dummy.zip" + pipeline.fit(ts) + + def check_no_fit_ts(path): + assert not hasattr(pipeline, "_fit_ts") + + save_mock.side_effect = check_no_fit_ts + + pipeline.save(path) + + save_mock.assert_called_once_with(path=path) + assert hasattr(pipeline, "_fit_ts") + + +@patch("etna.pipeline.pipeline.Pipeline.load") +def test_load_no_ts(load_mock, product_level_constant_hierarchical_ts, tmp_path): + ts = product_level_constant_hierarchical_ts + model = NaiveModel() + reconciliator = BottomUpReconciliator(target_level="market", source_level="product") + pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=[], horizon=1) + dir_path = pathlib.Path(tmp_path) + path = dir_path / "dummy.zip" + pipeline.fit(ts) + + pipeline.save(path) + loaded_pipeline = HierarchicalPipeline.load(path) + + load_mock.assert_called_once_with(path=path) + assert loaded_pipeline._fit_ts is None + assert loaded_pipeline.ts is None + assert loaded_pipeline == load_mock.return_value + + +@patch("etna.pipeline.pipeline.Pipeline.load") +def test_load_with_ts(load_mock, product_level_constant_hierarchical_ts, tmp_path): + ts = product_level_constant_hierarchical_ts + model = NaiveModel() + reconciliator = BottomUpReconciliator(target_level="market", source_level="product") + pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=[], horizon=1) + dir_path = pathlib.Path(tmp_path) + path = dir_path / "dummy.zip" + pipeline.fit(ts) + + pipeline.save(path) + loaded_pipeline = HierarchicalPipeline.load(path, ts=ts) + + load_mock.assert_called_once_with(path=path) + load_mock.return_value.reconciliator.aggregate.assert_called_once_with(ts=ts) + pd.testing.assert_frame_equal(loaded_pipeline._fit_ts.to_pandas(), ts.to_pandas()) 
+ assert loaded_pipeline.ts == load_mock.return_value.reconciliator.aggregate.return_value + + +@pytest.mark.parametrize( + "reconciliator", + ( + TopDownReconciliator(target_level="product", source_level="market", period=1, method="AHP"), + TopDownReconciliator(target_level="product", source_level="market", period=1, method="PHA"), + BottomUpReconciliator(target_level="market", source_level="product"), + BottomUpReconciliator(target_level="total", source_level="market"), + ), +) +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=[1])], + ), + ( + LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=[1])], + ), + (NaiveModel(), []), + (ProphetModel(), []), + ], +) +def test_save_load(model, transforms, reconciliator, product_level_constant_hierarchical_ts): + horizon = 1 + pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=transforms, horizon=horizon) + assert_pipeline_equals_loaded_original(pipeline=pipeline, ts=product_level_constant_hierarchical_ts) + + +@pytest.mark.parametrize( + "reconciliator", + ( + TopDownReconciliator(target_level="product", source_level="market", period=1, method="AHP"), + TopDownReconciliator(target_level="product", source_level="market", period=1, method="PHA"), + BottomUpReconciliator(target_level="market", source_level="product"), + BottomUpReconciliator(target_level="total", source_level="market"), + ), +) +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=[1])], + ), + ( + LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=[1])], + ), + (NaiveModel(), []), + (ProphetModel(), []), + ], +) +def test_forecast_given_ts(model, transforms, reconciliator, product_level_constant_hierarchical_ts): + horizon = 1 
+ pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=transforms, horizon=horizon) + assert_pipeline_forecasts_given_ts(pipeline=pipeline, ts=product_level_constant_hierarchical_ts, horizon=horizon) + + +@pytest.mark.parametrize( + "reconciliator", + ( + TopDownReconciliator(target_level="product", source_level="market", period=1, method="AHP"), + TopDownReconciliator(target_level="product", source_level="market", period=1, method="PHA"), + BottomUpReconciliator(target_level="market", source_level="product"), + BottomUpReconciliator(target_level="total", source_level="market"), + ), +) +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=[1])], + ), + ( + LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=[1])], + ), + (NaiveModel(), []), + (ProphetModel(), []), + ], +) +def test_forecast_given_ts_with_prediction_interval( + model, transforms, reconciliator, product_level_constant_hierarchical_ts +): + horizon = 1 + pipeline = HierarchicalPipeline(reconciliator=reconciliator, model=model, transforms=transforms, horizon=horizon) + assert_pipeline_forecasts_given_ts_with_prediction_intervals( + pipeline=pipeline, ts=product_level_constant_hierarchical_ts, horizon=horizon, n_folds=2 + ) diff --git a/tests/test_pipeline/test_pipeline.py b/tests/test_pipeline/test_pipeline.py index 31358459e..5f2a13818 100644 --- a/tests/test_pipeline/test_pipeline.py +++ b/tests/test_pipeline/test_pipeline.py @@ -1,5 +1,4 @@ from copy import deepcopy -from datetime import datetime from typing import Dict from typing import List from unittest.mock import MagicMock @@ -14,7 +13,6 @@ from etna.metrics import MAE from etna.metrics import MSE from etna.metrics import SMAPE -from etna.metrics import Metric from etna.metrics import MetricAggregationMode from etna.metrics import Width from etna.models import 
CatBoostMultiSegmentModel @@ -30,13 +28,17 @@ from etna.models.base import PredictionIntervalContextRequiredAbstractModel from etna.pipeline import FoldMask from etna.pipeline import Pipeline +from etna.pipeline.base import CrossValidationMode from etna.transforms import AddConstTransform from etna.transforms import DateFlagsTransform +from etna.transforms import DifferencingTransform from etna.transforms import FilterFeaturesTransform from etna.transforms import LagTransform from etna.transforms import LogTransform from etna.transforms import TimeSeriesImputerTransform from tests.test_pipeline.utils import assert_pipeline_equals_loaded_original +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts +from tests.test_pipeline.utils import assert_pipeline_forecasts_given_ts_with_prediction_intervals from tests.utils import DummyMetric DEFAULT_METRICS = [MAE(mode=MetricAggregationMode.per_segment)] @@ -107,7 +109,7 @@ def test_private_forecast_context_ignorant_model(model_class): pipeline = Pipeline(model=model, horizon=5) pipeline.fit(ts) - _ = pipeline._forecast() + _ = pipeline._forecast(ts=ts) ts.make_future.assert_called_with(future_steps=pipeline.horizon) model.forecast.assert_called_with(ts=ts.make_future()) @@ -122,7 +124,7 @@ def test_private_forecast_context_required_model(model_class): pipeline = Pipeline(model=model, horizon=5) pipeline.fit(ts) - _ = pipeline._forecast() + _ = pipeline._forecast(ts=ts) ts.make_future.assert_called_with(future_steps=pipeline.horizon, tail_steps=model.context_size) model.forecast.assert_called_with(ts=ts.make_future(), prediction_size=pipeline.horizon) @@ -166,7 +168,7 @@ def test_forecast_with_intervals_other_model(base_forecast, model_class): pipeline = Pipeline(model=model, horizon=5) pipeline.fit(ts) _ = pipeline.forecast(prediction_interval=True, quantiles=(0.025, 0.975)) - base_forecast.assert_called_with(prediction_interval=True, quantiles=(0.025, 0.975), n_folds=3) + 
base_forecast.assert_called_with(ts=ts, prediction_interval=True, quantiles=(0.025, 0.975), n_folds=3) def test_forecast(example_tsds): @@ -221,7 +223,7 @@ def test_forecast_prediction_interval_builtin(example_tsds, model): @pytest.mark.parametrize("model", (MovingAverageModel(), LinearPerSegmentModel())) -def test_forecast_prediction_interval_interface(example_tsds, model): +def test_forecast_prediction_interval_not_builtin(example_tsds, model): """Test the forecast interface for the models without built-in prediction intervals.""" pipeline = Pipeline(model=model, transforms=[DateFlagsTransform()], horizon=5) pipeline.fit(example_tsds) @@ -232,7 +234,7 @@ def test_forecast_prediction_interval_interface(example_tsds, model): assert (segment_slice["target_0.975"] - segment_slice["target_0.025"] >= 0).all() -def test_forecast_prediction_interval(splited_piecewise_constant_ts): +def test_forecast_prediction_interval_correct_values(splited_piecewise_constant_ts): """Test that the prediction interval for piecewise-constant dataset is correct.""" train, test = splited_piecewise_constant_ts pipeline = Pipeline(model=NaiveModel(lag=1), transforms=[], horizon=5) @@ -279,106 +281,86 @@ def test_forecast_prediction_interval_noise(constant_ts, constant_noisy_ts): @pytest.mark.parametrize("n_folds", (0, -1)) def test_invalid_n_folds(catboost_pipeline: Pipeline, n_folds: int, example_tsdf: TSDataset): """Test Pipeline.backtest behavior in case of invalid n_folds.""" - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Folds number should be a positive number"): _ = catboost_pipeline.backtest(ts=example_tsdf, metrics=DEFAULT_METRICS, n_folds=n_folds) -def test_validate_backtest_dataset(catboost_pipeline_big: Pipeline, imbalanced_tsdf: TSDataset): - """Test Pipeline.backtest behavior in case of small dataframe that - can't be divided to required number of splits. 
- """ - with pytest.raises(ValueError): - _ = catboost_pipeline_big.backtest(ts=imbalanced_tsdf, n_folds=3, metrics=DEFAULT_METRICS) - +@pytest.mark.parametrize( + "min_size, n_folds, horizon, stride", + [ + (1, 10, 1, 1), + (9, 10, 1, 1), + (10, 10, 2, 1), + (19, 10, 2, 2), + (28, 10, 2, 3), + ], +) +def test_invalid_backtest_dataset_size(min_size, n_folds, horizon, stride): + """Test Pipeline.backtest behavior in case of too small dataframe for given number of folds.""" + df = generate_ar_df(start_time="2020-01-01", periods=100, n_segments=2, freq="D") + df_wide = TSDataset.to_dataset(df) + to_remove = len(df_wide) - min_size + df_wide.iloc[:to_remove, 0] = np.NaN + ts = TSDataset(df=df_wide, freq="D") + pipeline = Pipeline(model=NaiveModel(lag=horizon), horizon=horizon) + + with pytest.raises(ValueError, match="All the series from feature dataframe should contain at least .* timestamps"): + _ = pipeline.backtest(ts=ts, n_folds=n_folds, stride=stride, metrics=DEFAULT_METRICS) + + +def test_invalid_backtest_metrics_empty(catboost_pipeline: Pipeline, example_tsdf: TSDataset): + """Test Pipeline.backtest behavior in case of empty metrics.""" + with pytest.raises(ValueError, match="At least one metric required"): + _ = catboost_pipeline.backtest(ts=example_tsdf, metrics=[], n_folds=2) + + +def test_invalid_backtest_metrics_macro(catboost_pipeline: Pipeline, example_tsdf: TSDataset): + """Test Pipeline.backtest behavior in case of macro metrics.""" + with pytest.raises(ValueError, match="All the metrics should be in"): + _ = catboost_pipeline.backtest(ts=example_tsdf, metrics=[MAE(mode=MetricAggregationMode.macro)], n_folds=2) + + +def test_invalid_backtest_mode_set_on_fold_mask(catboost_pipeline: Pipeline, example_tsdf: TSDataset): + """Test Pipeline.backtest behavior on setting mode with fold masks.""" + masks = [ + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-03", + target_timestamps=["2020-04-04", "2020-04-05", "2020-04-06"], + 
), + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-06", + target_timestamps=["2020-04-07", "2020-04-08", "2020-04-09"], + ), + ] + with pytest.raises(ValueError, match="Mode shouldn't be set if n_folds are fold masks"): + _ = catboost_pipeline.backtest(ts=example_tsdf, n_folds=masks, mode="expand", metrics=DEFAULT_METRICS) + + +def test_invalid_backtest_stride_set_on_fold_mask(catboost_pipeline: Pipeline, example_tsdf: TSDataset): + """Test Pipeline.backtest behavior on setting stride with fold masks.""" + masks = [ + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-03", + target_timestamps=["2020-04-04", "2020-04-05", "2020-04-06"], + ), + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-06", + target_timestamps=["2020-04-07", "2020-04-08", "2020-04-09"], + ), + ] + with pytest.raises(ValueError, match="Stride shouldn't be set if n_folds are fold masks"): + _ = catboost_pipeline.backtest(ts=example_tsdf, n_folds=masks, stride=2, metrics=DEFAULT_METRICS) -@pytest.mark.parametrize("metrics", ([], [MAE(mode=MetricAggregationMode.macro)])) -def test_invalid_backtest_metrics(catboost_pipeline: Pipeline, metrics: List[Metric], example_tsdf: TSDataset): - """Test Pipeline.backtest behavior in case of invalid metrics.""" - with pytest.raises(ValueError): - _ = catboost_pipeline.backtest(ts=example_tsdf, metrics=metrics, n_folds=2) - -def test_generate_expandable_timeranges_days(): - """Test train-test timeranges generation in expand mode with daily freq""" - df = pd.DataFrame({"timestamp": pd.date_range("2021-01-01", "2021-04-01")}) - df["segment"] = "seg" - df["target"] = 1 - df = df.pivot(index="timestamp", columns="segment").reorder_levels([1, 0], axis=1).sort_index(axis=1) - df.columns.names = ["segment", "feature"] - ts = TSDataset(df, freq="D") - - true_borders = ( - (("2021-01-01", "2021-02-24"), ("2021-02-25", "2021-03-08")), - (("2021-01-01", "2021-03-08"), 
("2021-03-09", "2021-03-20")), - (("2021-01-01", "2021-03-20"), ("2021-03-21", "2021-04-01")), - ) - masks = Pipeline._generate_masks_from_n_folds(ts=ts, n_folds=3, horizon=12, mode="expand") - for i, stage_dfs in enumerate(Pipeline._generate_folds_datasets(ts, masks=masks, horizon=12)): - for stage_df, borders in zip(stage_dfs, true_borders[i]): - assert stage_df.index.min() == datetime.strptime(borders[0], "%Y-%m-%d").date() - assert stage_df.index.max() == datetime.strptime(borders[1], "%Y-%m-%d").date() - - -def test_generate_expandable_timeranges_hours(): - """Test train-test timeranges generation in expand mode with hour freq""" - df = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", "2020-02-01", freq="H")}) - df["segment"] = "seg" - df["target"] = 1 - df = df.pivot(index="timestamp", columns="segment").reorder_levels([1, 0], axis=1).sort_index(axis=1) - df.columns.names = ["segment", "feature"] - ts = TSDataset(df, freq="H") - - true_borders = ( - (("2020-01-01 00:00:00", "2020-01-30 12:00:00"), ("2020-01-30 13:00:00", "2020-01-31 00:00:00")), - (("2020-01-01 00:00:00", "2020-01-31 00:00:00"), ("2020-01-31 01:00:00", "2020-01-31 12:00:00")), - (("2020-01-01 00:00:00", "2020-01-31 12:00:00"), ("2020-01-31 13:00:00", "2020-02-01 00:00:00")), - ) - masks = Pipeline._generate_masks_from_n_folds(ts=ts, n_folds=3, horizon=12, mode="expand") - for i, stage_dfs in enumerate(Pipeline._generate_folds_datasets(ts, horizon=12, masks=masks)): - for stage_df, borders in zip(stage_dfs, true_borders[i]): - assert stage_df.index.min() == datetime.strptime(borders[0], "%Y-%m-%d %H:%M:%S").date() - assert stage_df.index.max() == datetime.strptime(borders[1], "%Y-%m-%d %H:%M:%S").date() - - -def test_generate_constant_timeranges_days(): - """Test train-test timeranges generation with constant mode with daily freq""" - df = pd.DataFrame({"timestamp": pd.date_range("2021-01-01", "2021-04-01")}) - df["segment"] = "seg" - df["target"] = 1 - df = df.pivot(index="timestamp", 
columns="segment").reorder_levels([1, 0], axis=1).sort_index(axis=1) - df.columns.names = ["segment", "feature"] - ts = TSDataset(df, freq="D") - - true_borders = ( - (("2021-01-01", "2021-02-24"), ("2021-02-25", "2021-03-08")), - (("2021-01-13", "2021-03-08"), ("2021-03-09", "2021-03-20")), - (("2021-01-25", "2021-03-20"), ("2021-03-21", "2021-04-01")), - ) - masks = Pipeline._generate_masks_from_n_folds(ts=ts, n_folds=3, horizon=12, mode="constant") - for i, stage_dfs in enumerate(Pipeline._generate_folds_datasets(ts, horizon=12, masks=masks)): - for stage_df, borders in zip(stage_dfs, true_borders[i]): - assert stage_df.index.min() == datetime.strptime(borders[0], "%Y-%m-%d").date() - assert stage_df.index.max() == datetime.strptime(borders[1], "%Y-%m-%d").date() - - -def test_generate_constant_timeranges_hours(): - """Test train-test timeranges generation with constant mode with hours freq""" - df = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", "2020-02-01", freq="H")}) - df["segment"] = "seg" - df["target"] = 1 - df = df.pivot(index="timestamp", columns="segment").reorder_levels([1, 0], axis=1).sort_index(axis=1) - df.columns.names = ["segment", "feature"] - ts = TSDataset(df, freq="H") - true_borders = ( - (("2020-01-01 00:00:00", "2020-01-30 12:00:00"), ("2020-01-30 13:00:00", "2020-01-31 00:00:00")), - (("2020-01-01 12:00:00", "2020-01-31 00:00:00"), ("2020-01-31 01:00:00", "2020-01-31 12:00:00")), - (("2020-01-02 00:00:00", "2020-01-31 12:00:00"), ("2020-01-31 13:00:00", "2020-02-01 00:00:00")), - ) - masks = Pipeline._generate_masks_from_n_folds(ts=ts, n_folds=3, horizon=12, mode="constant") - for i, stage_dfs in enumerate(Pipeline._generate_folds_datasets(ts, horizon=12, masks=masks)): - for stage_df, borders in zip(stage_dfs, true_borders[i]): - assert stage_df.index.min() == datetime.strptime(borders[0], "%Y-%m-%d %H:%M:%S").date() - assert stage_df.index.max() == datetime.strptime(borders[1], "%Y-%m-%d %H:%M:%S").date() 
+@pytest.mark.parametrize("stride", [-1, 0]) +def test_invalid_backtest_stride_not_positive(stride, catboost_pipeline: Pipeline, example_tsdf: TSDataset): + """Test Pipeline.backtest behavior on setting not positive stride.""" + with pytest.raises(ValueError, match="Stride should be a positive number, .* given"): + _ = catboost_pipeline.backtest(ts=example_tsdf, n_folds=3, stride=stride, metrics=DEFAULT_METRICS) @pytest.mark.parametrize( @@ -394,7 +376,7 @@ def test_generate_constant_timeranges_hours(): ), ), ) -def test_get_metrics_interface( +def test_backtest_metrics_interface( catboost_pipeline: Pipeline, aggregate_metrics: bool, expected_columns: List[str], big_daily_example_tsdf: TSDataset ): """Check that Pipeline.backtest returns metrics in correct format.""" @@ -406,47 +388,263 @@ def test_get_metrics_interface( assert sorted(expected_columns) == sorted(metrics_df.columns) -def test_get_forecasts_interface_daily(catboost_pipeline: Pipeline, big_daily_example_tsdf: TSDataset): +@pytest.mark.parametrize( + "ts_fixture", + [ + "big_daily_example_tsdf", + "example_tsdf", + ], +) +def test_backtest_forecasts_columns(ts_fixture, catboost_pipeline, request): """Check that Pipeline.backtest returns forecasts in correct format.""" - _, forecast, _ = catboost_pipeline.backtest(ts=big_daily_example_tsdf, metrics=DEFAULT_METRICS) + ts = request.getfixturevalue(ts_fixture) + _, forecast, _ = catboost_pipeline.backtest(ts=ts, metrics=DEFAULT_METRICS) expected_columns = sorted( ["regressor_lag_feature_10", "regressor_lag_feature_11", "regressor_lag_feature_12", "fold_number", "target"] ) assert expected_columns == sorted(set(forecast.columns.get_level_values("feature"))) -def test_get_forecasts_interface_hours(catboost_pipeline: Pipeline, example_tsdf: TSDataset): - """Check that Pipeline.backtest returns forecasts in correct format with non-daily seasonality.""" - _, forecast, _ = catboost_pipeline.backtest(ts=example_tsdf, metrics=DEFAULT_METRICS) - expected_columns = 
sorted( - ["regressor_lag_feature_10", "regressor_lag_feature_11", "regressor_lag_feature_12", "fold_number", "target"] - ) - assert expected_columns == sorted(set(forecast.columns.get_level_values("feature"))) +@pytest.mark.parametrize( + "n_folds, horizon, expected_timestamps", + [ + (2, 3, [-6, -5, -4, -3, -2, -1]), + (2, 5, [-10, -9, -8, -7, -6, -5, -4, -3, -2, -1]), + ( + [ + FoldMask( + first_train_timestamp=pd.Timestamp("2020-01-01"), + last_train_timestamp=pd.Timestamp("2020-01-31 14:00"), + target_timestamps=[pd.Timestamp("2020-01-31 17:00")], + ), + FoldMask( + first_train_timestamp=pd.Timestamp("2020-01-01"), + last_train_timestamp=pd.Timestamp("2020-01-31 19:00"), + target_timestamps=[pd.Timestamp("2020-01-31 22:00")], + ), + ], + 5, + [-8, -3], + ), + ], +) +def test_backtest_forecasts_timestamps(n_folds, horizon, expected_timestamps, example_tsdf): + """Check that Pipeline.backtest returns forecasts with expected timestamps.""" + pipeline = Pipeline(model=NaiveModel(lag=horizon), horizon=horizon) + _, forecast, _ = pipeline.backtest(ts=example_tsdf, metrics=DEFAULT_METRICS, n_folds=n_folds) + timestamp = example_tsdf.index + np.testing.assert_array_equal(forecast.index, timestamp[expected_timestamps]) -def test_get_fold_info_interface_daily(catboost_pipeline: Pipeline, big_daily_example_tsdf: TSDataset): + +@pytest.mark.parametrize( + "n_folds, horizon, stride, expected_timestamps", + [ + (2, 3, 3, [-6, -5, -4, -3, -2, -1]), + (2, 3, 1, [-4, -3, -2, -3, -2, -1]), + (2, 3, 5, [-8, -7, -6, -3, -2, -1]), + ], +) +def test_backtest_forecasts_timestamps_with_stride(n_folds, horizon, stride, expected_timestamps, example_tsdf): + """Check that Pipeline.backtest with stride returns forecasts with expected timestamps.""" + pipeline = Pipeline(model=NaiveModel(lag=horizon), horizon=horizon) + _, forecast, _ = pipeline.backtest(ts=example_tsdf, metrics=DEFAULT_METRICS, n_folds=n_folds, stride=stride) + timestamp = example_tsdf.index + + 
np.testing.assert_array_equal(forecast.index, timestamp[expected_timestamps]) + + +@pytest.mark.parametrize( + "ts_fixture, n_folds", + [ + ("big_daily_example_tsdf", 1), + ("big_daily_example_tsdf", 2), + ("example_tsdf", 1), + ("example_tsdf", 2), + ], +) +def test_backtest_fold_info_format(ts_fixture, n_folds, request): """Check that Pipeline.backtest returns info dataframe in correct format.""" - _, _, info_df = catboost_pipeline.backtest(ts=big_daily_example_tsdf, metrics=DEFAULT_METRICS) + ts = request.getfixturevalue(ts_fixture) + pipeline = Pipeline(model=NaiveModel(lag=7), horizon=7) + _, _, info_df = pipeline.backtest(ts=ts, metrics=DEFAULT_METRICS, n_folds=n_folds) + + expected_folds = pd.Series(np.arange(n_folds)) + pd.testing.assert_series_equal(info_df["fold_number"], expected_folds, check_names=False) expected_columns = ["fold_number", "test_end_time", "test_start_time", "train_end_time", "train_start_time"] assert expected_columns == sorted(info_df.columns) -def test_get_fold_info_interface_hours(catboost_pipeline: Pipeline, example_tsdf: TSDataset): - """Check that Pipeline.backtest returns info dataframe in correct format with non-daily seasonality.""" - _, _, info_df = catboost_pipeline.backtest(ts=example_tsdf, metrics=DEFAULT_METRICS) - expected_columns = ["fold_number", "test_end_time", "test_start_time", "train_end_time", "train_start_time"] - assert expected_columns == sorted(info_df.columns) +@pytest.mark.parametrize( + "mode, n_folds, refit, horizon, stride, expected_train_starts, expected_train_ends, expected_test_starts, expected_test_ends", + [ + ("expand", 3, True, 7, None, [0, 0, 0], [-22, -15, -8], [-21, -14, -7], [-15, -8, -1]), + ("expand", 3, True, 7, 1, [0, 0, 0], [-10, -9, -8], [-9, -8, -7], [-3, -2, -1]), + ("expand", 3, True, 7, 10, [0, 0, 0], [-28, -18, -8], [-27, -17, -7], [-21, -11, -1]), + ("expand", 3, False, 7, None, [0, 0, 0], [-22, -22, -22], [-21, -14, -7], [-15, -8, -1]), + ("expand", 3, False, 7, 1, [0, 0, 0], [-10, 
-10, -10], [-9, -8, -7], [-3, -2, -1]), + ("expand", 3, False, 7, 10, [0, 0, 0], [-28, -28, -28], [-27, -17, -7], [-21, -11, -1]), + ("expand", 1, 1, 7, None, [0], [-8], [-7], [-1]), + ("expand", 1, 2, 7, None, [0], [-8], [-7], [-1]), + ("expand", 3, 1, 7, None, [0, 0, 0], [-22, -15, -8], [-21, -14, -7], [-15, -8, -1]), + ("expand", 3, 2, 7, None, [0, 0, 0], [-22, -22, -8], [-21, -14, -7], [-15, -8, -1]), + ("expand", 3, 3, 7, None, [0, 0, 0], [-22, -22, -22], [-21, -14, -7], [-15, -8, -1]), + ("expand", 3, 4, 7, None, [0, 0, 0], [-22, -22, -22], [-21, -14, -7], [-15, -8, -1]), + ("expand", 4, 1, 7, None, [0, 0, 0, 0], [-29, -22, -15, -8], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("expand", 4, 2, 7, None, [0, 0, 0, 0], [-29, -29, -15, -15], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("expand", 4, 2, 7, 1, [0, 0, 0, 0], [-11, -11, -9, -9], [-10, -9, -8, -7], [-4, -3, -2, -1]), + ("expand", 4, 2, 7, 10, [0, 0, 0, 0], [-38, -38, -18, -18], [-37, -27, -17, -7], [-31, -21, -11, -1]), + ("expand", 4, 3, 7, None, [0, 0, 0, 0], [-29, -29, -29, -8], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("expand", 4, 4, 7, None, [0, 0, 0, 0], [-29, -29, -29, -29], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("expand", 4, 5, 7, None, [0, 0, 0, 0], [-29, -29, -29, -29], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("constant", 3, True, 7, None, [0, 7, 14], [-22, -15, -8], [-21, -14, -7], [-15, -8, -1]), + ("constant", 3, True, 7, 1, [0, 1, 2], [-10, -9, -8], [-9, -8, -7], [-3, -2, -1]), + ("constant", 3, True, 7, 10, [0, 10, 20], [-28, -18, -8], [-27, -17, -7], [-21, -11, -1]), + ("constant", 3, False, 7, None, [0, 0, 0], [-22, -22, -22], [-21, -14, -7], [-15, -8, -1]), + ("constant", 3, False, 7, 1, [0, 0, 0], [-10, -10, -10], [-9, -8, -7], [-3, -2, -1]), + ("constant", 3, False, 7, 10, [0, 0, 0], [-28, -28, -28], [-27, -17, -7], [-21, -11, -1]), + ("constant", 1, 1, 7, None, [0], [-8], [-7], [-1]), + ("constant", 1, 2, 7, None, [0], [-8], [-7], [-1]), + ("constant", 3, 1, 7, None, 
[0, 7, 14], [-22, -15, -8], [-21, -14, -7], [-15, -8, -1]), + ("constant", 3, 2, 7, None, [0, 0, 14], [-22, -22, -8], [-21, -14, -7], [-15, -8, -1]), + ("constant", 3, 3, 7, None, [0, 0, 0], [-22, -22, -22], [-21, -14, -7], [-15, -8, -1]), + ("constant", 3, 4, 7, None, [0, 0, 0], [-22, -22, -22], [-21, -14, -7], [-15, -8, -1]), + ("constant", 4, 1, 7, None, [0, 7, 14, 21], [-29, -22, -15, -8], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("constant", 4, 2, 7, None, [0, 0, 14, 14], [-29, -29, -15, -15], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("constant", 4, 2, 7, 1, [0, 0, 2, 2], [-11, -11, -9, -9], [-10, -9, -8, -7], [-4, -3, -2, -1]), + ("constant", 4, 2, 7, 10, [0, 0, 20, 20], [-38, -38, -18, -18], [-37, -27, -17, -7], [-31, -21, -11, -1]), + ("constant", 4, 3, 7, None, [0, 0, 0, 21], [-29, -29, -29, -8], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("constant", 4, 4, 7, None, [0, 0, 0, 0], [-29, -29, -29, -29], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ("constant", 4, 5, 7, None, [0, 0, 0, 0], [-29, -29, -29, -29], [-28, -21, -14, -7], [-22, -15, -8, -1]), + ( + None, + [ + FoldMask( + first_train_timestamp=None, + last_train_timestamp=pd.Timestamp("2020-01-31 10:00"), + target_timestamps=[pd.Timestamp("2020-01-31 14:00")], + ), + FoldMask( + first_train_timestamp=None, + last_train_timestamp=pd.Timestamp("2020-01-31 17:00"), + target_timestamps=[pd.Timestamp("2020-01-31 21:00")], + ), + ], + True, + 7, + None, + [0, 0], + [-15, -8], + [-14, -7], + [-8, -1], + ), + ( + None, + [ + FoldMask( + first_train_timestamp=pd.Timestamp("2020-01-01 1:00"), + last_train_timestamp=pd.Timestamp("2020-01-31 10:00"), + target_timestamps=[pd.Timestamp("2020-01-31 14:00")], + ), + FoldMask( + first_train_timestamp=pd.Timestamp("2020-01-01 8:00"), + last_train_timestamp=pd.Timestamp("2020-01-31 17:00"), + target_timestamps=[pd.Timestamp("2020-01-31 21:00")], + ), + ], + True, + 7, + None, + [1, 8], + [-15, -8], + [-14, -7], + [-8, -1], + ), + ( + None, + [ + FoldMask( + 
first_train_timestamp=None, + last_train_timestamp=pd.Timestamp("2020-01-30 20:00"), + target_timestamps=[pd.Timestamp("2020-01-31 00:00")], + ), + FoldMask( + first_train_timestamp=None, + last_train_timestamp=pd.Timestamp("2020-01-31 03:00"), + target_timestamps=[pd.Timestamp("2020-01-31 07:00")], + ), + FoldMask( + first_train_timestamp=None, + last_train_timestamp=pd.Timestamp("2020-01-31 10:00"), + target_timestamps=[pd.Timestamp("2020-01-31 14:00")], + ), + FoldMask( + first_train_timestamp=None, + last_train_timestamp=pd.Timestamp("2020-01-31 17:00"), + target_timestamps=[pd.Timestamp("2020-01-31 21:00")], + ), + ], + 2, + 7, + None, + [0, 0, 0, 0], + [-29, -29, -15, -15], + [-28, -21, -14, -7], + [-22, -15, -8, -1], + ), + ], +) +def test_backtest_fold_info_timestamps( + mode, + n_folds, + refit, + horizon, + stride, + expected_train_starts, + expected_train_ends, + expected_test_starts, + expected_test_ends, + example_tsdf, +): + """Check that Pipeline.backtest returns info dataframe with correct timestamps.""" + pipeline = Pipeline(model=NaiveModel(lag=horizon), horizon=horizon) + _, _, info_df = pipeline.backtest( + ts=example_tsdf, metrics=DEFAULT_METRICS, mode=mode, n_folds=n_folds, refit=refit, stride=stride + ) + timestamp = example_tsdf.index + + np.testing.assert_array_equal(info_df["train_start_time"], timestamp[expected_train_starts]) + np.testing.assert_array_equal(info_df["train_end_time"], timestamp[expected_train_ends]) + np.testing.assert_array_equal(info_df["test_start_time"], timestamp[expected_test_starts]) + np.testing.assert_array_equal(info_df["test_end_time"], timestamp[expected_test_ends]) + + +def test_backtest_refit_success(catboost_pipeline: Pipeline, big_example_tsdf: TSDataset): + """Check that backtest without refit works on pipeline that supports it.""" + _ = catboost_pipeline.backtest(ts=big_example_tsdf, n_jobs=1, metrics=DEFAULT_METRICS, n_folds=3, refit=False) + + +def test_backtest_refit_fail(big_example_tsdf: TSDataset): 
+ """Check that backtest without refit doesn't work on pipeline that doesn't support it.""" + pipeline = Pipeline( + model=NaiveModel(lag=7), + transforms=[DifferencingTransform(in_column="target", inplace=True)], + horizon=7, + ) + with pytest.raises(ValueError, match="Test should go after the train without gaps"): + _ = pipeline.backtest(ts=big_example_tsdf, n_jobs=1, metrics=DEFAULT_METRICS, n_folds=3, refit=False) @pytest.mark.long_1 -def test_backtest_with_n_jobs(catboost_pipeline: Pipeline, big_example_tsdf: TSDataset): +@pytest.mark.parametrize("refit", [True, False, 2]) +def test_backtest_with_n_jobs(refit, catboost_pipeline: Pipeline, big_example_tsdf: TSDataset): """Check that Pipeline.backtest gives the same results in case of single and multiple jobs modes.""" ts1 = deepcopy(big_example_tsdf) ts2 = deepcopy(big_example_tsdf) pipeline_1 = deepcopy(catboost_pipeline) pipeline_2 = deepcopy(catboost_pipeline) - _, forecast_1, _ = pipeline_1.backtest(ts=ts1, n_jobs=1, metrics=DEFAULT_METRICS) - _, forecast_2, _ = pipeline_2.backtest(ts=ts2, n_jobs=3, metrics=DEFAULT_METRICS) + _, forecast_1, _ = pipeline_1.backtest(ts=ts1, n_jobs=1, n_folds=4, metrics=DEFAULT_METRICS, refit=refit) + _, forecast_2, _ = pipeline_2.backtest(ts=ts2, n_jobs=3, n_folds=4, metrics=DEFAULT_METRICS, refit=refit) assert (forecast_1 == forecast_2).all().all() @@ -460,10 +658,10 @@ def test_backtest_forecasts_sanity(step_ts: TSDataset): assert np.all(forecast_df == expected_forecast_df) -def test_forecast_raise_error_if_not_fitted(): - """Test that Pipeline raise error when calling forecast without being fit.""" +def test_forecast_raise_error_if_no_ts(): + """Test that Pipeline raises error when calling forecast without ts.""" pipeline = Pipeline(model=NaiveModel(), horizon=5) - with pytest.raises(ValueError, match="Pipeline is not fitted!"): + with pytest.raises(ValueError, match="There is no ts to forecast!"): _ = pipeline.forecast() @@ -476,11 +674,13 @@ def 
test_forecast_pipeline_with_nan_at_the_end(df_with_nans_in_tails): @pytest.mark.parametrize( - "n_folds, mode, expected_masks", + "n_folds, horizon, stride, mode, expected_masks", ( ( 2, - "expand", + 3, + 3, + CrossValidationMode.expand, [ FoldMask( first_train_timestamp="2020-01-01", @@ -496,7 +696,45 @@ def test_forecast_pipeline_with_nan_at_the_end(df_with_nans_in_tails): ), ( 2, - "constant", + 3, + 1, + CrossValidationMode.expand, + [ + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-05", + target_timestamps=["2020-04-06", "2020-04-07", "2020-04-08"], + ), + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-06", + target_timestamps=["2020-04-07", "2020-04-08", "2020-04-09"], + ), + ], + ), + ( + 2, + 3, + 5, + CrossValidationMode.expand, + [ + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-01", + target_timestamps=["2020-04-02", "2020-04-03", "2020-04-04"], + ), + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-06", + target_timestamps=["2020-04-07", "2020-04-08", "2020-04-09"], + ), + ], + ), + ( + 2, + 3, + 3, + CrossValidationMode.constant, [ FoldMask( first_train_timestamp="2020-01-01", @@ -510,10 +748,48 @@ def test_forecast_pipeline_with_nan_at_the_end(df_with_nans_in_tails): ), ], ), + ( + 2, + 3, + 1, + CrossValidationMode.constant, + [ + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-05", + target_timestamps=["2020-04-06", "2020-04-07", "2020-04-08"], + ), + FoldMask( + first_train_timestamp="2020-01-02", + last_train_timestamp="2020-04-06", + target_timestamps=["2020-04-07", "2020-04-08", "2020-04-09"], + ), + ], + ), + ( + 2, + 3, + 5, + CrossValidationMode.constant, + [ + FoldMask( + first_train_timestamp="2020-01-01", + last_train_timestamp="2020-04-01", + target_timestamps=["2020-04-02", "2020-04-03", "2020-04-04"], + ), + FoldMask( + first_train_timestamp="2020-01-06", + 
last_train_timestamp="2020-04-06", + target_timestamps=["2020-04-07", "2020-04-08", "2020-04-09"], + ), + ], + ), ), ) -def test_generate_masks_from_n_folds(example_tsds: TSDataset, n_folds, mode, expected_masks): - masks = Pipeline._generate_masks_from_n_folds(ts=example_tsds, n_folds=n_folds, horizon=3, mode=mode) +def test_generate_masks_from_n_folds(example_tsds: TSDataset, n_folds, horizon, stride, mode, expected_masks): + masks = Pipeline._generate_masks_from_n_folds( + ts=example_tsds, n_folds=n_folds, horizon=horizon, stride=stride, mode=mode + ) for mask, expected_mask in zip(masks, expected_masks): assert mask.first_train_timestamp == expected_mask.first_train_timestamp assert mask.last_train_timestamp == expected_mask.last_train_timestamp @@ -530,7 +806,7 @@ def test_generate_folds_datasets(ts_name, mask, request): """Check _generate_folds_datasets for correct work.""" ts = request.getfixturevalue(ts_name) pipeline = Pipeline(model=NaiveModel(lag=7)) - mask = pipeline._prepare_fold_masks(ts=ts, masks=[mask], mode="constant")[0] + mask = pipeline._prepare_fold_masks(ts=ts, masks=[mask], mode=CrossValidationMode.expand, stride=-1)[0] train, test = list(pipeline._generate_folds_datasets(ts, [mask], 4))[0] assert train.index.min() == np.datetime64(mask.first_train_timestamp) assert train.index.max() == np.datetime64(mask.last_train_timestamp) @@ -548,7 +824,7 @@ def test_generate_folds_datasets_without_first_date(ts_name, mask, request): """Check _generate_folds_datasets for correct work without first date.""" ts = request.getfixturevalue(ts_name) pipeline = Pipeline(model=NaiveModel(lag=7)) - mask = pipeline._prepare_fold_masks(ts=ts, masks=[mask], mode="constant")[0] + mask = pipeline._prepare_fold_masks(ts=ts, masks=[mask], mode=CrossValidationMode.expand, stride=-1)[0] train, test = list(pipeline._generate_folds_datasets(ts, [mask], 4))[0] assert train.index.min() == np.datetime64(ts.index.min()) assert train.index.max() == 
np.datetime64(mask.last_train_timestamp) @@ -563,17 +839,103 @@ def test_generate_folds_datasets_without_first_date(ts_name, mask, request): (FoldMask("2020-01-01", "2020-01-07", ["2020-01-08", "2020-01-11"]), {"segment_0": 95.5, "segment_1": 5}), ), ) -def test_run_fold(ts_run_fold: TSDataset, mask: FoldMask, expected: Dict[str, List[float]]): - train, test = ts_run_fold.train_test_split( +def test_process_fold_forecast(ts_process_fold_forecast, mask: FoldMask, expected: Dict[str, List[float]]): + train, test = ts_process_fold_forecast.train_test_split( train_start=mask.first_train_timestamp, train_end=mask.last_train_timestamp ) pipeline = Pipeline(model=NaiveModel(lag=5), transforms=[], horizon=4) - fold = pipeline._run_fold(train, test, 1, mask, [MAE()], forecast_params=dict()) + pipeline = pipeline.fit(ts=train) + forecast = pipeline.forecast() + fold = pipeline._process_fold_forecast( + forecast=forecast, train=train, test=test, pipeline=pipeline, fold_number=1, mask=mask, metrics=[MAE()] + ) for seg in fold["metrics"]["MAE"].keys(): assert fold["metrics"]["MAE"][seg] == expected[seg] +def test_make_backtest_fold_groups_refit_true(): + masks = [MagicMock() for _ in range(2)] + obtained_results = Pipeline._make_backtest_fold_groups(masks=masks, refit=True) + expected_results = [ + { + "train_fold_number": 0, + "train_mask": masks[0], + "forecast_fold_numbers": [0], + "forecast_masks": [masks[0]], + }, + { + "train_fold_number": 1, + "train_mask": masks[1], + "forecast_fold_numbers": [1], + "forecast_masks": [masks[1]], + }, + ] + assert obtained_results == expected_results + + +def test_make_backtest_fold_groups_refit_false(): + masks = [MagicMock() for _ in range(2)] + obtained_results = Pipeline._make_backtest_fold_groups(masks=masks, refit=False) + expected_results = [ + { + "train_fold_number": 0, + "train_mask": masks[0], + "forecast_fold_numbers": [0, 1], + "forecast_masks": [masks[0], masks[1]], + } + ] + assert obtained_results == expected_results + + 
+def test_make_backtest_fold_groups_refit_int(): + masks = [MagicMock() for _ in range(5)] + obtained_results = Pipeline._make_backtest_fold_groups(masks=masks, refit=2) + expected_results = [ + { + "train_fold_number": 0, + "train_mask": masks[0], + "forecast_fold_numbers": [0, 1], + "forecast_masks": [masks[0], masks[1]], + }, + { + "train_fold_number": 2, + "train_mask": masks[2], + "forecast_fold_numbers": [2, 3], + "forecast_masks": [masks[2], masks[3]], + }, + { + "train_fold_number": 4, + "train_mask": masks[4], + "forecast_fold_numbers": [4], + "forecast_masks": [masks[4]], + }, + ] + assert obtained_results == expected_results + + +@pytest.mark.parametrize( + "n_folds, refit, expected_refits", + [ + (1, 1, 1), + (1, 2, 1), + (3, 1, 3), + (3, 2, 2), + (3, 3, 1), + (3, 4, 1), + (4, 1, 4), + (4, 2, 2), + (4, 3, 2), + (4, 4, 1), + (4, 5, 1), + ], +) +def test_make_backtest_fold_groups_length_refit_int(n_folds, refit, expected_refits): + masks = [MagicMock() for _ in range(n_folds)] + obtained_results = Pipeline._make_backtest_fold_groups(masks=masks, refit=refit) + assert len(obtained_results) == expected_refits + + @pytest.mark.parametrize( "lag,expected", ((5, {"segment_0": 76.923077, "segment_1": 90.909091}), (6, {"segment_0": 100, "segment_1": 120})) ) @@ -724,6 +1086,51 @@ def test_predict(model, transforms, example_tsds): assert len(result_df) == len(example_tsds.segments) * num_points +@pytest.mark.parametrize("load_ts", [True, False]) +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + ( + LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + (SeasonalMovingAverageModel(window=2, seasonality=7), []), + (SARIMAXModel(), []), + (ProphetModel(), []), + ], +) +def test_save_load(load_ts, model, transforms, example_tsds): + horizon = 3 + pipeline = 
Pipeline(model=model, transforms=transforms, horizon=horizon) + assert_pipeline_equals_loaded_original(pipeline=pipeline, ts=example_tsds, load_ts=load_ts) + + +@pytest.mark.parametrize( + "model, transforms", + [ + ( + CatBoostMultiSegmentModel(iterations=100), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + ( + LinearPerSegmentModel(), + [DateFlagsTransform(), LagTransform(in_column="target", lags=list(range(3, 10)))], + ), + (SeasonalMovingAverageModel(window=2, seasonality=7), []), + (SARIMAXModel(), []), + (ProphetModel(), []), + ], +) +def test_forecast_given_ts(model, transforms, example_tsds): + horizon = 3 + pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon) + assert_pipeline_forecasts_given_ts(pipeline=pipeline, ts=example_tsds, horizon=horizon) + + @pytest.mark.parametrize( "model, transforms", [ @@ -740,7 +1147,7 @@ def test_predict(model, transforms, example_tsds): (ProphetModel(), []), ], ) -def test_save_load(model, transforms, example_tsds): +def test_forecast_given_ts_with_prediction_interval(model, transforms, example_tsds): horizon = 3 pipeline = Pipeline(model=model, transforms=transforms, horizon=horizon) - assert_pipeline_equals_loaded_original(pipeline=pipeline, ts=example_tsds) + assert_pipeline_forecasts_given_ts_with_prediction_intervals(pipeline=pipeline, ts=example_tsds, horizon=horizon) diff --git a/tests/test_pipeline/utils.py b/tests/test_pipeline/utils.py index 506c286d2..84b73e3b7 100644 --- a/tests/test_pipeline/utils.py +++ b/tests/test_pipeline/utils.py @@ -1,6 +1,7 @@ import pathlib import tempfile from copy import deepcopy +from typing import List from typing import Tuple import pandas as pd @@ -9,17 +10,30 @@ from etna.pipeline.base import AbstractPipeline -def get_loaded_pipeline(pipeline: AbstractPipeline, ts: TSDataset) -> AbstractPipeline: +def get_loaded_pipeline(pipeline: AbstractPipeline, ts: TSDataset = None) -> AbstractPipeline: with 
tempfile.TemporaryDirectory() as dir_path_str: dir_path = pathlib.Path(dir_path_str) path = dir_path.joinpath("dummy.zip") pipeline.save(path) - loaded_pipeline = pipeline.load(path, ts=ts) + if ts is None: + loaded_pipeline = pipeline.load(path) + else: + loaded_pipeline = pipeline.load(path, ts=ts) return loaded_pipeline +def select_segments_subset(ts: TSDataset, segments: List[str]) -> TSDataset: + df = ts.raw_df.loc[:, pd.IndexSlice[segments, :]] + df_exog = ts.df_exog + if df_exog is not None: + df_exog = df_exog.loc[:, pd.IndexSlice[segments, :]] + known_future = ts.known_future + freq = ts.freq + return TSDataset(df=df, df_exog=df_exog, known_future=known_future, freq=freq) + + def assert_pipeline_equals_loaded_original( - pipeline: AbstractPipeline, ts: TSDataset + pipeline: AbstractPipeline, ts: TSDataset, load_ts: bool = True ) -> Tuple[AbstractPipeline, AbstractPipeline]: import torch # TODO: remove after fix at issue-802 @@ -29,10 +43,67 @@ def assert_pipeline_equals_loaded_original( torch.manual_seed(11) forecast_ts_1 = pipeline.forecast() - loaded_pipeline = get_loaded_pipeline(pipeline, ts=initial_ts) - torch.manual_seed(11) - forecast_ts_2 = loaded_pipeline.forecast() + if load_ts: + loaded_pipeline = get_loaded_pipeline(pipeline, ts=initial_ts) + torch.manual_seed(11) + forecast_ts_2 = loaded_pipeline.forecast() + else: + loaded_pipeline = get_loaded_pipeline(pipeline) + torch.manual_seed(11) + forecast_ts_2 = loaded_pipeline.forecast(ts=initial_ts) pd.testing.assert_frame_equal(forecast_ts_1.to_pandas(), forecast_ts_2.to_pandas()) return pipeline, loaded_pipeline + + +def assert_pipeline_forecasts_given_ts(pipeline: AbstractPipeline, ts: TSDataset, horizon: int) -> AbstractPipeline: + fit_ts = deepcopy(ts) + fit_ts.df = fit_ts.df.iloc[:-horizon] + to_forecast_ts = deepcopy(ts) + + pipeline.fit(ts=fit_ts) + forecast_ts = pipeline.forecast(ts=to_forecast_ts) + forecast_df = forecast_ts.to_pandas(flatten=True) + + if ts.has_hierarchy(): + 
expected_segments = ts.hierarchical_structure.get_level_segments(forecast_ts.current_df_level) + else: + expected_segments = to_forecast_ts.segments + assert forecast_ts.segments == expected_segments + expected_index = pd.date_range( + start=to_forecast_ts.index[-1], periods=horizon + 1, freq=to_forecast_ts.freq, name="timestamp" + )[1:] + pd.testing.assert_index_equal(forecast_ts.index, expected_index) + assert not forecast_df["target"].isna().any() + + return pipeline + + +def assert_pipeline_forecasts_given_ts_with_prediction_intervals( + pipeline: AbstractPipeline, ts: TSDataset, horizon: int, **forecast_params +) -> AbstractPipeline: + fit_ts = deepcopy(ts) + fit_ts.df = fit_ts.df.iloc[:-horizon] + to_forecast_ts = deepcopy(ts) + + pipeline.fit(fit_ts) + forecast_ts = pipeline.forecast( + ts=to_forecast_ts, prediction_interval=True, quantiles=[0.025, 0.975], **forecast_params + ) + forecast_df = forecast_ts.to_pandas(flatten=True) + + if ts.has_hierarchy(): + expected_segments = ts.hierarchical_structure.get_level_segments(forecast_ts.current_df_level) + else: + expected_segments = to_forecast_ts.segments + assert forecast_ts.segments == expected_segments + expected_index = pd.date_range( + start=to_forecast_ts.index[-1], periods=horizon + 1, freq=to_forecast_ts.freq, name="timestamp" + )[1:] + pd.testing.assert_index_equal(forecast_ts.index, expected_index) + assert not forecast_df["target"].isna().any() + assert not forecast_df["target_0.025"].isna().any() + assert not forecast_df["target_0.975"].isna().any() + + return pipeline diff --git a/tests/test_transforms/sklearn/test_interface.py b/tests/test_transforms/sklearn/test_interface.py deleted file mode 100644 index f38d6f098..000000000 --- a/tests/test_transforms/sklearn/test_interface.py +++ /dev/null @@ -1,280 +0,0 @@ -from typing import List - -import numpy as np -import pandas as pd -import pytest - -from etna.datasets import TSDataset -from etna.datasets import generate_const_df -from etna.transforms 
import BoxCoxTransform -from etna.transforms import MaxAbsScalerTransform -from etna.transforms import MinMaxScalerTransform -from etna.transforms import RobustScalerTransform -from etna.transforms import StandardScalerTransform -from etna.transforms import YeoJohnsonTransform - - -@pytest.fixture -def multicolumn_ts(random_seed): - df = generate_const_df(start_time="2020-01-01", periods=20, freq="D", scale=1.0, n_segments=3) - df["target"] += np.random.uniform(0, 0.1, size=df.shape[0]) - df_exog = df.copy().rename(columns={"target": "exog_1"}) - for i in range(2, 6): - df_exog[f"exog_{i}"] = float(i) + np.random.uniform(0, 0.1, size=df.shape[0]) - - df_formatted = TSDataset.to_dataset(df) - df_exog_formatted = TSDataset.to_dataset(df_exog) - - return TSDataset(df=df_formatted, df_exog=df_exog_formatted, freq="D") - - -def extract_new_features_columns(transformed_df: pd.DataFrame, initial_df: pd.DataFrame) -> List[str]: - """Extract columns from feature level that are present in transformed_df but not present in initial_df.""" - return ( - transformed_df.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .unique() - .tolist() - ) - - -@pytest.mark.parametrize( - "transform_constructor", - ( - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -def test_fail_invalid_mode(transform_constructor): - """Test that transform raises error in invalid mode.""" - with pytest.raises(ValueError): - _ = transform_constructor(mode="non_existent") - - -@pytest.mark.parametrize( - "transform_constructor", - ( - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -def 
test_warning_not_inplace(transform_constructor): - """Test that transform raises warning if inplace is set to True, but out_column is also given.""" - with pytest.warns(UserWarning, match="Transformation will be applied inplace"): - _ = transform_constructor(inplace=True, out_column="new_exog") - - -@pytest.mark.parametrize( - "transform_constructor", - [ - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ], -) -@pytest.mark.parametrize( - "in_column", - [ - "exog_1", - ["exog_1", "exog_2"], - ], -) -def test_inplace_no_new_columns(transform_constructor, in_column, multicolumn_ts): - """Test that transform in inplace mode doesn't generate new columns.""" - transform = transform_constructor(in_column=in_column, inplace=True) - initial_df = multicolumn_ts.to_pandas() - transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) - - # check new columns - new_columns = extract_new_features_columns(transformed_df, initial_df) - assert len(new_columns) == 0 - - # check that output columns are input columns - assert transform.out_columns == transform.in_column - - -@pytest.mark.parametrize( - "transform_constructor", - [ - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ], -) -@pytest.mark.parametrize( - "in_column", - [ - "exog_1", - ["exog_1", "exog_2"], - ], -) -def test_creating_columns(transform_constructor, in_column, multicolumn_ts): - """Test that transform creates new columns according to out_column parameter.""" - transform = transform_constructor(in_column=in_column, out_column="new_exog", inplace=False) - initial_df = multicolumn_ts.to_pandas() - transformed_df = 
transform.fit_transform(multicolumn_ts.to_pandas()) - - # check new columns - new_columns = set(extract_new_features_columns(transformed_df, initial_df)) - in_column = [in_column] if isinstance(in_column, str) else in_column - expected_columns = {f"new_exog_{column}" for column in in_column} - assert new_columns == expected_columns - - # check that output columns are matching input columns - assert len(transform.in_column) == len(transform.out_columns) - assert all( - [f"new_exog_{column}" == new_column for column, new_column in zip(transform.in_column, transform.out_columns)] - ) - - -@pytest.mark.parametrize( - "transform_constructor", - [ - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ], -) -@pytest.mark.parametrize( - "in_column", - [ - "exog_1", - ["exog_1", "exog_2"], - ], -) -def test_generated_column_names(transform_constructor, in_column, multicolumn_ts): - """Test that transform generates names for the columns correctly.""" - transform = transform_constructor(in_column=in_column, out_column=None, inplace=False) - initial_df = multicolumn_ts.to_pandas() - transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) - segments = sorted(multicolumn_ts.segments) - - new_columns = extract_new_features_columns(transformed_df, initial_df) - - # check new columns - for column in new_columns: - # create transform from column - transform_temp = eval(column) - df_temp = transform_temp.fit_transform(multicolumn_ts.to_pandas()) - columns_temp = extract_new_features_columns(df_temp, initial_df) - - # compare column names and column values - assert len(columns_temp) == 1 - column_temp = columns_temp[0] - assert column_temp == column - assert np.all( - df_temp.loc[:, pd.IndexSlice[segments, column_temp]] - == transformed_df.loc[:, pd.IndexSlice[segments, column]] - ) - - # check that output 
columns are matching input columns - assert len(transform.in_column) == len(transform.out_columns) - # check that name if this input column is present inside name of this output column - assert all([(column in new_column) for column, new_column in zip(transform.in_column, transform.out_columns)]) - - -@pytest.mark.parametrize( - "transform_constructor", - [ - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ], -) -def test_all_columns(transform_constructor, multicolumn_ts): - """Test that transform can process all columns using None value for in_column.""" - transform = transform_constructor(in_column=None, out_column=None, inplace=False) - initial_df = multicolumn_ts.df.copy() - transformed_df = transform.fit_transform(multicolumn_ts.df) - - new_columns = extract_new_features_columns(transformed_df, initial_df) - assert len(new_columns) == initial_df.columns.get_level_values("feature").nunique() - - -@pytest.mark.parametrize( - "transform_constructor", - [ - BoxCoxTransform, - YeoJohnsonTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ], -) -@pytest.mark.parametrize( - "in_column", [["exog_1", "exog_2", "exog_3"], ["exog_2", "exog_1", "exog_3"], ["exog_3", "exog_2", "exog_1"]] -) -@pytest.mark.parametrize( - "mode", - [ - "macro", - "per-segment", - ], -) -def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): - """Test that transform don't mix columns between each other.""" - transform = transform_constructor(in_column=in_column, out_column=None, mode=mode, inplace=False) - transforms_one_column = [ - transform_constructor(in_column=column, out_column=None, mode=mode, inplace=False) for column in in_column - ] - - segments = 
sorted(multicolumn_ts.segments) - transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) - - transformed_dfs_one_column = [] - for transform_one_column in transforms_one_column: - transformed_dfs_one_column.append(transform_one_column.fit_transform(multicolumn_ts.to_pandas())) - - in_to_out_columns = {key: value for key, value in zip(transform.in_column, transform.out_columns)} - for i, column in enumerate(in_column): - # find relevant column name in transformed_df - column_multi = in_to_out_columns[column] - - # find relevant column name in transformed_dfs_one_column[i] - column_single = transforms_one_column[i].out_columns[0] - - df_multi = transformed_df.loc[:, pd.IndexSlice[segments, column_multi]] - df_single = transformed_dfs_one_column[i].loc[:, pd.IndexSlice[segments, column_single]] - assert np.all(df_multi == df_single) diff --git a/tests/test_transforms/sklearn/test_power_transform.py b/tests/test_transforms/sklearn/test_power_transform.py deleted file mode 100644 index ae64fe465..000000000 --- a/tests/test_transforms/sklearn/test_power_transform.py +++ /dev/null @@ -1,109 +0,0 @@ -from typing import Any - -import numpy as np -import numpy.testing as npt -import pandas as pd -import pytest -from sklearn.preprocessing import PowerTransformer - -from etna.datasets import TSDataset -from etna.transforms import AddConstTransform -from etna.transforms.math import BoxCoxTransform -from etna.transforms.math import YeoJohnsonTransform - - -@pytest.fixture -def non_positive_df() -> pd.DataFrame: - df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="1d")}) - df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="1d")}) - df_1["segment"] = "Moscow" - df_1["target"] = 0 - df_1["exog"] = -1 - df_2["segment"] = "Omsk" - df_2["target"] = -1 - df_2["exog"] = -7 - classic_df = pd.concat([df_1, df_2], ignore_index=True) - return TSDataset.to_dataset(classic_df) - - -@pytest.fixture 
-def positive_df() -> pd.DataFrame: - df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="1d")}) - df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="1d")}) - generator = np.random.RandomState(seed=1) - df_1["segment"] = "Moscow" - df_1["target"] = np.abs(generator.normal(loc=10, scale=1, size=len(df_1))) + 1 - df_1["exog"] = np.abs(generator.normal(loc=15, scale=1, size=len(df_1))) + 1 - df_2["segment"] = "Omsk" - df_2["target"] = np.abs(generator.normal(loc=20, scale=1, size=len(df_2))) + 1 - df_2["exog"] = np.abs(generator.normal(loc=4, scale=1, size=len(df_2))) + 1 - classic_df = pd.concat([df_1, df_2], ignore_index=True) - return TSDataset.to_dataset(classic_df) - - -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_non_positive_series_behavior(non_positive_df: pd.DataFrame, mode: str): - """Check BoxCoxPreprocessing behavior in case of negative-value series.""" - preprocess = BoxCoxTransform(mode=mode) - with pytest.raises(ValueError): - _ = preprocess.fit_transform(df=non_positive_df) - - -@pytest.mark.parametrize( - "preprocessing_class,method", ((BoxCoxTransform, "box-cox"), (YeoJohnsonTransform, "yeo-johnson")) -) -def test_transform_value_all_columns(positive_df: pd.DataFrame, preprocessing_class: Any, method: str): - """Check the value of transform result for all columns.""" - preprocess_none = preprocessing_class() - preprocess_all = preprocessing_class(in_column=positive_df.columns.get_level_values("feature").unique()) - value_none = preprocess_none.fit_transform(df=positive_df.copy()) - value_all = preprocess_all.fit_transform(df=positive_df.copy()) - true_values = PowerTransformer(method=method).fit_transform(positive_df.values) - npt.assert_array_almost_equal(value_none.values, true_values) - npt.assert_array_almost_equal(value_all.values, true_values) - - -@pytest.mark.parametrize( - "preprocessing_class,method", ((BoxCoxTransform, "box-cox"), 
(YeoJohnsonTransform, "yeo-johnson")) -) -def test_transform_value_one_column(positive_df: pd.DataFrame, preprocessing_class: Any, method: str): - """Check the value of transform result.""" - preprocess = preprocessing_class(in_column="target") - processed_values = preprocess.fit_transform(df=positive_df.copy()) - target_processed_values = processed_values.loc[:, pd.IndexSlice[:, "target"]].values - rest_processed_values = processed_values.drop("target", axis=1, level="feature").values - untouched_values = positive_df.drop("target", axis=1, level="feature").values - true_values = PowerTransformer(method=method).fit_transform(positive_df.loc[:, pd.IndexSlice[:, "target"]].values) - npt.assert_array_almost_equal(target_processed_values, true_values) - npt.assert_array_almost_equal(rest_processed_values, untouched_values) - - -@pytest.mark.parametrize("preprocessing_class", (BoxCoxTransform, YeoJohnsonTransform)) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_inverse_transform_all_columns(positive_df: pd.DataFrame, preprocessing_class: Any, mode: str): - """Check that inverse_transform rolls back transform result for all columns.""" - preprocess_none = preprocessing_class(mode=mode) - preprocess_all = preprocessing_class(in_column=positive_df.columns.get_level_values("feature").unique(), mode=mode) - transformed_target_none = preprocess_none.fit_transform(df=positive_df.copy()) - transformed_target_all = preprocess_all.fit_transform(df=positive_df.copy()) - inversed_target_none = preprocess_none.inverse_transform(df=transformed_target_none) - inversed_target_all = preprocess_all.inverse_transform(df=transformed_target_all) - np.testing.assert_array_almost_equal(inversed_target_none.values, positive_df.values) - np.testing.assert_array_almost_equal(inversed_target_all.values, positive_df.values) - - -@pytest.mark.parametrize("preprocessing_class", (BoxCoxTransform, YeoJohnsonTransform)) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) 
-def test_inverse_transform_one_column(positive_df: pd.DataFrame, preprocessing_class: Any, mode: str): - """Check that inverse_transform rolls back transform result for one column.""" - preprocess = preprocessing_class(in_column="target", mode=mode) - transformed_target = preprocess.fit_transform(df=positive_df.copy()) - inversed_target = preprocess.inverse_transform(df=transformed_target) - np.testing.assert_array_almost_equal(inversed_target.values, positive_df.values) - - -@pytest.mark.parametrize("preprocessing_class", (BoxCoxTransform, YeoJohnsonTransform)) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_fit_transform_with_nans(preprocessing_class, mode, ts_diff_endings): - preprocess = preprocessing_class(in_column="target", mode=mode) - ts_diff_endings.fit_transform([AddConstTransform(in_column="target", value=100)] + [preprocess]) diff --git a/tests/test_transforms/sklearn/test_scalers_transform.py b/tests/test_transforms/sklearn/test_scalers_transform.py deleted file mode 100644 index 75b0b4f51..000000000 --- a/tests/test_transforms/sklearn/test_scalers_transform.py +++ /dev/null @@ -1,145 +0,0 @@ -from typing import List -from typing import Optional -from typing import Union - -import numpy as np -import numpy.testing as npt -import pandas as pd -import pytest - -from etna.datasets import TSDataset -from etna.transforms import MaxAbsScalerTransform -from etna.transforms import MinMaxScalerTransform -from etna.transforms import RobustScalerTransform -from etna.transforms import StandardScalerTransform -from etna.transforms.math.sklearn import SklearnTransform -from etna.transforms.math.sklearn import TransformMode - - -class DummySkTransform: - def fit(self, X, y=None): # noqa: N803 - pass - - def transform(self, X, y=None): # noqa: N803 - return X - - def inverse_transform(self, X, y=None): # noqa: N803 - return X - - -class DummyTransform(SklearnTransform): - def __init__( - self, - in_column: Optional[Union[str, List[str]]] = 
None, - inplace: bool = True, - out_column: Optional[str] = None, - mode: Union[TransformMode, str] = "per-segment", - ): - super().__init__( - in_column=in_column, - inplace=inplace, - out_column=out_column, - transformer=DummySkTransform(), - mode=mode, - ) - - -@pytest.fixture -def normal_distributed_df() -> pd.DataFrame: - df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="1d")}) - df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="1d")}) - generator = np.random.RandomState(seed=1) - df_1["segment"] = "Moscow" - df_1["target"] = generator.normal(loc=0, scale=10, size=len(df_1)) - df_1["exog"] = generator.normal(loc=2, scale=10, size=len(df_1)) - df_2["segment"] = "Omsk" - df_2["target"] = generator.normal(loc=5, scale=1, size=len(df_2)) - df_2["exog"] = generator.normal(loc=3, scale=1, size=len(df_2)) - classic_df = pd.concat([df_1, df_2], ignore_index=True) - return TSDataset.to_dataset(classic_df) - - -@pytest.mark.parametrize( - "scaler", - ( - DummyTransform(), - StandardScalerTransform(), - RobustScalerTransform(), - MinMaxScalerTransform(), - MaxAbsScalerTransform(), - StandardScalerTransform(with_std=False), - RobustScalerTransform(with_centering=False, with_scaling=False), - MinMaxScalerTransform(feature_range=(5, 10)), - ), -) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_dummy_inverse_transform_all_columns(normal_distributed_df, scaler, mode): - """Check that `inverse_transform(transform(df)) == df` for all columns.""" - scaler.mode = TransformMode(mode) - feature_df = scaler.fit_transform(df=normal_distributed_df.copy()) - inversed_df = scaler.inverse_transform(df=feature_df.copy()) - npt.assert_array_almost_equal(normal_distributed_df.values, inversed_df.values) - - -@pytest.mark.parametrize( - "scaler", - ( - DummyTransform(in_column="target"), - StandardScalerTransform(in_column="target"), - RobustScalerTransform(in_column="target"), - 
MinMaxScalerTransform(in_column="target"), - MaxAbsScalerTransform(in_column="target"), - StandardScalerTransform(in_column="target", with_std=False), - RobustScalerTransform(in_column="target", with_centering=False, with_scaling=False), - MinMaxScalerTransform(in_column="target", feature_range=(5, 10)), - ), -) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_dummy_inverse_transform_one_column(normal_distributed_df, scaler, mode): - """Check that `inverse_transform(transform(df)) == df` for one column.""" - scaler.mode = TransformMode(mode) - feature_df = scaler.fit_transform(df=normal_distributed_df.copy()) - inversed_df = scaler.inverse_transform(df=feature_df) - npt.assert_array_almost_equal(normal_distributed_df.values, inversed_df.values) - - -@pytest.mark.parametrize( - "scaler", - ( - DummyTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_inverse_transform_not_inplace(normal_distributed_df, scaler, mode): - """Check that inversed values the same for not inplace version.""" - not_inplace_scaler = scaler(inplace=False, mode=mode) - columns_to_compare = normal_distributed_df.columns - transformed_df = not_inplace_scaler.fit_transform(df=normal_distributed_df.copy()) - inverse_transformed_df = not_inplace_scaler.inverse_transform(transformed_df) - assert np.all(inverse_transformed_df[columns_to_compare] == normal_distributed_df) - - -@pytest.mark.parametrize( - "scaler", - ( - DummyTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_fit_transform_with_nans(scaler, mode, ts_diff_endings): - preprocess = 
scaler(in_column="target", mode=mode) - ts_diff_endings.fit_transform([preprocess]) diff --git a/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py index 0ac9bced2..a57ac6ea2 100644 --- a/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py @@ -9,11 +9,11 @@ from tests.test_transforms.utils import assert_transformation_equals_loaded_original -@pytest.mark.parametrize("expected_global_means", ([[3, 30]])) +@pytest.mark.parametrize("expected_global_means", [{"Moscow": 3, "Omsk": 30}]) def test_mean_segment_encoder_fit(simple_df, expected_global_means): encoder = MeanSegmentEncoderTransform() encoder.fit(simple_df) - assert (encoder.global_means == expected_global_means).all() + assert encoder.global_means == expected_global_means def test_mean_segment_encoder_transform(simple_df, transformed_simple_df): @@ -22,6 +22,38 @@ def test_mean_segment_encoder_transform(simple_df, transformed_simple_df): pd.testing.assert_frame_equal(transformed_df, transformed_simple_df) +def test_subset_segments(simple_df): + train_df = simple_df + test_df = simple_df.loc[:, pd.IndexSlice["Omsk", :]] + transform = MeanSegmentEncoderTransform() + + transform.fit(train_df) + transformed_test_df = transform.transform(test_df) + + segments = sorted(transformed_test_df.columns.get_level_values("segment").unique()) + features = sorted(transformed_test_df.columns.get_level_values("feature").unique()) + assert segments == ["Omsk"] + assert features == ["exog", "segment_mean", "target"] + + +def test_not_fitted_error(simple_df): + encoder = MeanSegmentEncoderTransform() + with pytest.raises(ValueError, match="The transform isn't fitted"): + encoder.transform(simple_df) + + +def test_new_segments_error(simple_df): + train_df = simple_df.loc[:, pd.IndexSlice["Moscow", :]] + test_df = simple_df.loc[:, 
pd.IndexSlice["Omsk", :]] + transform = MeanSegmentEncoderTransform() + + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.transform(test_df) + + @pytest.fixture def almost_constant_ts(random_seed) -> TSDataset: df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-07-01", freq="D")}) diff --git a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py index 8a8891bb3..18bb32249 100644 --- a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest from etna.transforms import SegmentEncoderTransform from tests.test_transforms.utils import assert_transformation_equals_loaded_original @@ -21,6 +22,40 @@ def test_segment_encoder_transform(dummy_df): assert codes == {0, 1}, "Codes are not 0 and 1" +def test_subset_segments(dummy_df): + train_df = dummy_df + test_df = dummy_df.loc[:, pd.IndexSlice["Omsk", :]] + transform = SegmentEncoderTransform() + + transform.fit(train_df) + transformed_test_df = transform.transform(test_df) + + segments = sorted(transformed_test_df.columns.get_level_values("segment").unique()) + features = sorted(transformed_test_df.columns.get_level_values("feature").unique()) + assert segments == ["Omsk"] + assert features == ["segment_code", "target"] + values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]] + assert np.all(values == values.iloc[0]) + + +def test_not_fitted_error(dummy_df): + encoder = SegmentEncoderTransform() + with pytest.raises(ValueError, match="The transform isn't fitted"): + encoder.transform(dummy_df) + + +def test_new_segments_error(dummy_df): + train_df = dummy_df.loc[:, pd.IndexSlice["Moscow", :]] + test_df = 
dummy_df.loc[:, pd.IndexSlice["Omsk", :]] + transform = SegmentEncoderTransform() + + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.transform(test_df) + + def test_save_load(example_tsds): transform = SegmentEncoderTransform() assert_transformation_equals_loaded_original(transform=transform, ts=example_tsds) diff --git a/tests/test_transforms/test_inference/common.py b/tests/test_transforms/test_inference/common.py new file mode 100644 index 000000000..5a38de52d --- /dev/null +++ b/tests/test_transforms/test_inference/common.py @@ -0,0 +1,19 @@ +from typing import Set +from typing import Tuple + +import pandas as pd + + +def find_columns_diff(df_before: pd.DataFrame, df_after: pd.DataFrame) -> Tuple[Set[str], Set[str], Set[str]]: + columns_before_transform = set(df_before.columns) + columns_after_transform = set(df_after.columns) + created_columns = columns_after_transform - columns_before_transform + removed_columns = columns_before_transform - columns_after_transform + + columns_to_check_changes = columns_after_transform.intersection(columns_before_transform) + changed_columns = set() + for column in columns_to_check_changes: + if not df_before[column].equals(df_after[column]): + changed_columns.add(column) + + return created_columns, removed_columns, changed_columns diff --git a/tests/test_transforms/test_inference/conftest.py b/tests/test_transforms/test_inference/conftest.py new file mode 100644 index 000000000..a2295302e --- /dev/null +++ b/tests/test_transforms/test_inference/conftest.py @@ -0,0 +1,154 @@ +import numpy as np +import pandas as pd +import pytest + +from etna.datasets import TSDataset +from etna.datasets import duplicate_data + + +@pytest.fixture +def regular_ts(random_seed) -> TSDataset: + periods = 100 + df_1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df_1["segment"] = "segment_1" + 
df_1["target"] = np.random.uniform(10, 20, size=periods) + + df_2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df_2["segment"] = "segment_2" + df_2["target"] = np.random.uniform(-15, 5, size=periods) + + df_3 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df_3["segment"] = "segment_3" + df_3["target"] = np.random.uniform(-5, 5, size=periods) + + df = pd.concat([df_1, df_2, df_3]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + tsds = TSDataset(df, freq="D") + + return tsds + + +@pytest.fixture +def ts_with_exog(regular_ts) -> TSDataset: + df = regular_ts.to_pandas(flatten=True) + periods = 200 + timestamp = pd.date_range("2020-01-01", periods=periods) + df_exog_common = pd.DataFrame( + { + "timestamp": timestamp, + "positive": 1, + "weekday": timestamp.weekday, + "monthday": timestamp.day, + "month": timestamp.month, + "year": timestamp.year, + } + ) + df_exog_wide = duplicate_data(df=df_exog_common, segments=regular_ts.segments) + + rng = np.random.default_rng(1) + df_exog_wide.loc[:, pd.IndexSlice["segment_1", "positive"]] = rng.uniform(5, 10, size=periods) + df_exog_wide.loc[:, pd.IndexSlice["segment_2", "positive"]] = rng.uniform(5, 10, size=periods) + df_exog_wide.loc[:, pd.IndexSlice["segment_3", "positive"]] = rng.uniform(5, 10, size=periods) + + ts = TSDataset(df=TSDataset.to_dataset(df).iloc[5:], df_exog=df_exog_wide, freq="D") + return ts + + +@pytest.fixture +def positive_ts() -> TSDataset: + periods = 100 + df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="D")}) + df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="D")}) + df_3 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="D")}) + generator = np.random.RandomState(seed=1) + + df_1["segment"] = "segment_1" + df_1["target"] = np.abs(generator.normal(loc=10, scale=1, size=len(df_1))) + 1 + + 
df_2["segment"] = "segment_2" + df_2["target"] = np.abs(generator.normal(loc=20, scale=1, size=len(df_2))) + 1 + + df_3["segment"] = "segment_3" + df_3["target"] = np.abs(generator.normal(loc=30, scale=1, size=len(df_3))) + 1 + + classic_df = pd.concat([df_1, df_2, df_3], ignore_index=True) + wide_df = TSDataset.to_dataset(classic_df) + ts = TSDataset(df=wide_df, freq="D") + return ts + + +@pytest.fixture +def ts_to_fill(regular_ts) -> TSDataset: + df = regular_ts.to_pandas() + df.iloc[5, 0] = np.nan + df.iloc[10, 1] = np.nan + df.iloc[20, 2] = np.nan + df.iloc[-5, 0] = np.nan + df.iloc[-10, 1] = np.nan + df.iloc[-20, 2] = np.nan + ts = TSDataset(df=df, freq="D") + return ts + + +@pytest.fixture +def ts_to_resample() -> TSDataset: + df_1 = pd.DataFrame( + { + "timestamp": pd.date_range(start="2020-01-05", freq="H", periods=120), + "segment": "segment_1", + "target": 1, + } + ) + df_2 = pd.DataFrame( + { + "timestamp": pd.date_range(start="2020-01-05", freq="H", periods=120), + "segment": "segment_2", + "target": ([1] + 23 * [0]) * 5, + } + ) + df_3 = pd.DataFrame( + { + "timestamp": pd.date_range(start="2020-01-05", freq="H", periods=120), + "segment": "segment_3", + "target": ([4] + 23 * [0]) * 5, + } + ) + df = pd.concat([df_1, df_2, df_3], ignore_index=True) + + df_exog_1 = pd.DataFrame( + { + "timestamp": pd.date_range(start="2020-01-05", freq="D", periods=8), + "segment": "segment_1", + "regressor_exog": 2, + } + ) + df_exog_2 = pd.DataFrame( + { + "timestamp": pd.date_range(start="2020-01-05", freq="D", periods=8), + "segment": "segment_2", + "regressor_exog": 40, + } + ) + df_exog_3 = pd.DataFrame( + { + "timestamp": pd.date_range(start="2020-01-05", freq="D", periods=8), + "segment": "segment_3", + "regressor_exog": 40, + } + ) + df_exog = pd.concat([df_exog_1, df_exog_2, df_exog_3], ignore_index=True) + ts = TSDataset(df=TSDataset.to_dataset(df), freq="H", df_exog=TSDataset.to_dataset(df_exog), known_future="all") + return ts + + +@pytest.fixture +def 
ts_with_outliers(regular_ts) -> TSDataset: + df = regular_ts.to_pandas() + df.iloc[5, 0] *= 100 + df.iloc[10, 1] *= 100 + df.iloc[20, 2] *= 100 + df.iloc[-5, 0] *= 100 + df.iloc[-10, 1] *= 100 + df.iloc[-20, 2] *= 100 + ts = TSDataset(df=df, freq="D") + return ts diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py new file mode 100644 index 000000000..c85ee8483 --- /dev/null +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -0,0 +1,1859 @@ +from copy import deepcopy + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal +from ruptures import Binseg +from sklearn.linear_model import LinearRegression +from sklearn.tree import DecisionTreeRegressor + +from etna.analysis import StatisticsRelevanceTable +from etna.datasets import TSDataset +from etna.models import ProphetModel +from etna.transforms import AddConstTransform +from etna.transforms import BinsegTrendTransform +from etna.transforms import BoxCoxTransform +from etna.transforms import ChangePointsSegmentationTransform +from etna.transforms import ChangePointsTrendTransform +from etna.transforms import DateFlagsTransform +from etna.transforms import DensityOutliersTransform +from etna.transforms import DifferencingTransform +from etna.transforms import FilterFeaturesTransform +from etna.transforms import FourierTransform +from etna.transforms import GaleShapleyFeatureSelectionTransform +from etna.transforms import HolidayTransform +from etna.transforms import LabelEncoderTransform +from etna.transforms import LagTransform +from etna.transforms import LambdaTransform +from etna.transforms import LinearTrendTransform +from etna.transforms import LogTransform +from etna.transforms import MADTransform +from etna.transforms import MaxAbsScalerTransform +from etna.transforms import MaxTransform +from etna.transforms import MeanSegmentEncoderTransform +from etna.transforms 
import MeanTransform +from etna.transforms import MedianOutliersTransform +from etna.transforms import MedianTransform +from etna.transforms import MinMaxDifferenceTransform +from etna.transforms import MinMaxScalerTransform +from etna.transforms import MinTransform +from etna.transforms import MRMRFeatureSelectionTransform +from etna.transforms import OneHotEncoderTransform +from etna.transforms import PredictionIntervalOutliersTransform +from etna.transforms import QuantileTransform +from etna.transforms import ResampleWithDistributionTransform +from etna.transforms import RobustScalerTransform +from etna.transforms import SegmentEncoderTransform +from etna.transforms import SpecialDaysTransform +from etna.transforms import StandardScalerTransform +from etna.transforms import StdTransform +from etna.transforms import STLTransform +from etna.transforms import SumTransform +from etna.transforms import TheilSenTrendTransform +from etna.transforms import TimeFlagsTransform +from etna.transforms import TimeSeriesImputerTransform +from etna.transforms import TreeFeatureSelectionTransform +from etna.transforms import TrendTransform +from etna.transforms import YeoJohnsonTransform +from etna.transforms.decomposition import RupturesChangePointsModel +from tests.test_transforms.test_inference.common import find_columns_diff +from tests.utils import select_segments_subset +from tests.utils import to_be_fixed + + +class TestInverseTransformTrainSubsetSegments: + """Test inverse transform on train part of subset of segments. + + Expected that inverse transformation on subset of segments match subset of inverse transformation on full dataset. 
+ """ + + def _test_inverse_transform_train_subset_segments(self, ts, transform, segments): + # select subset of tsdataset + segments = list(set(segments)) + subset_ts = select_segments_subset(ts=deepcopy(ts), segments=segments) + df = ts.to_pandas() + subset_df = subset_ts.to_pandas() + + # fitting + transform.fit(df) + + # transform full + transformed_df = transform.transform(df) + inverse_transformed_df = transform.inverse_transform(transformed_df) + + # transform subset of segments + transformed_subset_df = transform.transform(subset_df) + inverse_transformed_subset_df = transform.inverse_transform(transformed_subset_df) + + # checking + assert_frame_equal(inverse_transformed_subset_df, inverse_transformed_df.loc[:, pd.IndexSlice[segments, :]]) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), + (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), + (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), 
"ts_with_exog"), + (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), + # math + (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts"), + (LagTransform(in_column="target", lags=[1, 2, 3]), "regular_ts"), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False), + "regular_ts", + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + ), + (LogTransform(in_column="target", inplace=False), "positive_ts"), + (LogTransform(in_column="target", inplace=True), "positive_ts"), + (DifferencingTransform(in_column="target", inplace=False), "regular_ts"), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (MADTransform(in_column="target", window=7), "regular_ts"), + (MaxTransform(in_column="target", window=7), "regular_ts"), + (MeanTransform(in_column="target", window=7), "regular_ts"), + (MedianTransform(in_column="target", window=7), "regular_ts"), + (MinMaxDifferenceTransform(in_column="target", window=7), "regular_ts"), + (MinTransform(in_column="target", window=7), "regular_ts"), + (QuantileTransform(in_column="target", quantile=0.9, window=7), "regular_ts"), + (StdTransform(in_column="target", window=7), "regular_ts"), + (SumTransform(in_column="target", window=7), "regular_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + 
(MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", + distribution_column="target", + inplace=False, + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", 
distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + (TimeSeriesImputerTransform(in_column="target"), "ts_to_fill"), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + # timestamp + (DateFlagsTransform(), "regular_ts"), + (FourierTransform(period=7, order=2), "regular_ts"), + (HolidayTransform(), "regular_ts"), + (SpecialDaysTransform(), "regular_ts"), + (TimeFlagsTransform(), "regular_ts"), + ], + ) + def test_inverse_transform_train_subset_segments(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_train_subset_segments(ts, transform, segments=["segment_2"]) + + +class TestInverseTransformFutureSubsetSegments: + """Test inverse transform on future part of subset of segments. + + Expected that inverse transformation on subset of segments match subset of inverse transformation on full dataset. 
+ """ + + def _test_inverse_transform_future_subset_segments(self, ts, transform, segments, horizon=7): + # select subset of tsdataset + subset_ts = select_segments_subset(ts=deepcopy(ts), segments=segments) + train_df = ts.to_pandas() + ts.transforms = [transform] + subset_ts.transforms = [transform] + + # fitting + transform.fit(train_df) + + # transform full + transformed_future_ts = ts.make_future(future_steps=horizon) + transformed_future_df = transformed_future_ts.to_pandas() + inverse_transformed_future_ts = transform.inverse_transform(transformed_future_df) + + # transform subset of segments + transformed_subset_future_ts = subset_ts.make_future(future_steps=horizon) + transformed_subset_future_df = transformed_subset_future_ts.to_pandas() + inverse_transformed_subset_future_df = transform.inverse_transform(transformed_subset_future_df) + + # checking + assert_frame_equal( + inverse_transformed_subset_future_df, inverse_transformed_future_ts.loc[:, pd.IndexSlice[segments, :]] + ) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="positive", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "ts_with_exog", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (BinsegTrendTransform(in_column="positive"), "ts_with_exog"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="positive"), "ts_with_exog"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="positive"), "ts_with_exog"), + (STLTransform(in_column="target", period=7), 
"regular_ts"), + (STLTransform(in_column="positive", period=7), "ts_with_exog"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), + (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), + (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), + # math + (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts"), + (AddConstTransform(in_column="positive", value=1, inplace=True), "ts_with_exog"), + (LagTransform(in_column="target", lags=[1, 2, 3]), "regular_ts"), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False), + "regular_ts", + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + ), + ( + LambdaTransform( + in_column="positive", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "ts_with_exog", + ), + (LogTransform(in_column="target", inplace=False), "positive_ts"), + (LogTransform(in_column="target", inplace=True), "positive_ts"), + (LogTransform(in_column="positive", inplace=True), "ts_with_exog"), + (DifferencingTransform(in_column="target", inplace=False), "regular_ts"), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"), + 
(MADTransform(in_column="target", window=14), "regular_ts"), + (MaxTransform(in_column="target", window=14), "regular_ts"), + (MeanTransform(in_column="target", window=14), "regular_ts"), + (MedianTransform(in_column="target", window=14), "regular_ts"), + (MinMaxDifferenceTransform(in_column="target", window=14), "regular_ts"), + (MinTransform(in_column="target", window=14), "regular_ts"), + (QuantileTransform(in_column="target", quantile=0.9, window=14), "regular_ts"), + (StdTransform(in_column="target", window=14), "regular_ts"), + (SumTransform(in_column="target", window=14), "regular_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + 
(MinMaxScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + # missing_values + ( + 
ResampleWithDistributionTransform( + in_column="regressor_exog", + distribution_column="target", + inplace=False, + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + (TimeSeriesImputerTransform(in_column="target"), "ts_to_fill"), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + # timestamp + (DateFlagsTransform(), "regular_ts"), + (FourierTransform(period=7, order=2), "regular_ts"), + (HolidayTransform(), "regular_ts"), + (SpecialDaysTransform(), "regular_ts"), + (TimeFlagsTransform(), "regular_ts"), + ], + ) + def test_inverse_transform_future_subset_segments(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_subset_segments(ts, transform, segments=["segment_2"]) + + +class TestInverseTransformTrainNewSegments: + """Test inverse transform on train part of new segments. + + Expected that inverse transformation creates columns, removes columns and reverts values back to original. 
+ """ + + def _test_inverse_transform_train_new_segments(self, ts, transform, train_segments, expected_changes): + # select subset of tsdataset + train_segments = list(set(train_segments)) + forecast_segments = list(set(ts.segments) - set(train_segments)) + train_ts = select_segments_subset(ts=deepcopy(ts), segments=train_segments) + test_ts = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + train_df = train_ts.to_pandas() + test_df = test_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # transform + transformed_test_df = transform.transform(test_df.copy()) + + # inverse transform + inverse_transformed_test_df = transform.inverse_transform(transformed_test_df.copy()) + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + flat_inverse_transformed_test_df = TSDataset.to_flatten(inverse_transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff( + flat_transformed_test_df, flat_inverse_transformed_test_df + ) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + pd.testing.assert_frame_equal(flat_test_df[changed_columns], flat_inverse_transformed_test_df[changed_columns]) + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {}, + ), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {}), + (FilterFeaturesTransform(exclude=["year"], return_features=True), 
"ts_with_exog", {"create": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + GaleShapleyFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"year", "weekday", "month"}}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"monthday", "positive", "weekday"}}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {}, + ), + ( + TreeFeatureSelectionTransform( + model=DecisionTreeRegressor(random_state=42), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"month", "year", "weekday"}}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {"change": {"target"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {"change": {"target"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {"change": {"target"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (MADTransform(in_column="target", window=7, out_column="res"), "regular_ts", 
{}), + (MaxTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (MeanTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (MedianTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + ( + MinMaxDifferenceTransform(in_column="target", window=7, out_column="res"), + "regular_ts", + {}, + ), + (MinTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=7, out_column="res"), + "regular_ts", + {}, + ), + (StdTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (SumTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {"change": {"target"}}), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", 
mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {"change": {"target"}}), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + ( + FourierTransform(period=7, order=2, out_column="res"), + "regular_ts", + {}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + ], + ) + def test_inverse_transform_train_new_segments(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_train_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes + ) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # math + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + 
(MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + ( + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + ], + ) + def test_inverse_transform_train_new_segments_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(NotImplementedError): + self._test_inverse_transform_train_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + @to_be_fixed(raises=NotImplementedError, match="Per-segment transforms can't work on new segments") + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # 
timestamp + (SpecialDaysTransform(), "regular_ts"), + ], + ) + def test_inverse_transform_train_new_segments_failed_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_train_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + +class TestInverseTransformFutureNewSegments: + """Test inverse transform on future part of new segments. + + Expected that inverse transformation creates columns, removes columns and reverts values back to original. + """ + + def _test_inverse_transform_future_new_segments(self, ts, transform, train_segments, expected_changes, horizon=7): + # select subset of tsdataset + train_segments = list(set(train_segments)) + forecast_segments = list(set(ts.segments) - set(train_segments)) + train_ts = select_segments_subset(ts=deepcopy(ts), segments=train_segments) + test_ts_without_transform = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + test_ts_with_transform = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + test_ts_without_transform.transforms = [] + test_ts_with_transform.transforms = [transform] + train_df = train_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # prepare df without transform + test_ts = test_ts_without_transform.make_future(future_steps=horizon) + test_df = test_ts.to_pandas() + + # transform + transformed_test_ts = test_ts_with_transform.make_future(future_steps=horizon) + transformed_test_df = transformed_test_ts.to_pandas() + + # inverse transform + inverse_transformed_test_df = transform.inverse_transform(transformed_test_df) + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = 
TSDataset.to_flatten(transformed_test_df) + flat_inverse_transformed_test_df = TSDataset.to_flatten(inverse_transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff( + flat_transformed_test_df, flat_inverse_transformed_test_df + ) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + pd.testing.assert_frame_equal(flat_test_df[changed_columns], flat_inverse_transformed_test_df[changed_columns]) + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {}, + ), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {}), + (AddConstTransform(in_column="positive", value=1, inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {}, + 
), + ( + LambdaTransform( + in_column="positive", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "ts_with_exog", + {"change": {"positive"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {}), + (LogTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (MADTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (MaxTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (MeanTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (MedianTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + ( + MinMaxDifferenceTransform(in_column="target", window=14, out_column="res"), + "regular_ts", + {}, + ), + (MinTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=14, out_column="res"), + "regular_ts", + {}, + ), + (StdTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (SumTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {}), + ( + BoxCoxTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="positive", 
mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MinMaxScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {}), + ( + YeoJohnsonTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + ( + FourierTransform(period=7, order=2, out_column="res"), + "regular_ts", + {}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + ], + ) + def test_inverse_transform_future_new_segments(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + 
self._test_inverse_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes + ) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # math + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + 
(RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + ( + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + ], + ) + def test_inverse_transform_future_new_segments_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(NotImplementedError): + self._test_inverse_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + @to_be_fixed(raises=NotImplementedError, match="Per-segment transforms can't work on new segments") + @pytest.mark.parametrize( + "transform, 
dataset_name", + [ + # timestamp + (SpecialDaysTransform(), "regular_ts"), + ], + ) + def test_inverse_transform_future_new_segments_failed_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + @to_be_fixed(raises=Exception) + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # feature_selection + # TODO: working incorrectly, should fail + (FilterFeaturesTransform(exclude=["year"], return_features=True), "ts_with_exog", {"create": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"monthday", "positive", "weekday", "year", "month"}}, + ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"positive", "monthday", "weekday"}}, + ), + ( + TreeFeatureSelectionTransform( + model=DecisionTreeRegressor(random_state=42), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"year", "month", "weekday"}}, + ), + ], + ) + def test_inverse_transform_future_new_segments_failed_error( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes + ) + + +class TestInverseTransformFutureWithTarget: + """Test inverse transform on future dataset with known target. + + Expected that inverse transformation creates columns, removes columns and reverts values back to original. 
+ """ + + def _test_inverse_transform_future_with_target( + self, ts, transform, expected_changes, gap_size=7, transform_size=50 + ): + # select subset of tsdataset + history_ts, future_full_ts = ts.train_test_split(test_size=gap_size + transform_size) + _, test_ts = future_full_ts.train_test_split(test_size=transform_size) + train_df = history_ts.to_pandas() + test_df = test_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # transform + transformed_test_df = transform.transform(test_df.copy()) + + # inverse transform + inverse_transformed_test_df = transform.inverse_transform(transformed_test_df.copy()) + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + flat_inverse_transformed_test_df = TSDataset.to_flatten(inverse_transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff( + flat_transformed_test_df, flat_inverse_transformed_test_df + ) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + pd.testing.assert_frame_equal( + flat_test_df[changed_columns], + flat_inverse_transformed_test_df[changed_columns], + ) + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + out_column="res", + ), + "regular_ts", + {}, + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + {"change": {"target"}}, + ), + 
(BinsegTrendTransform(in_column="target"), "regular_ts", {"change": {"target"}}), + (LinearTrendTransform(in_column="target"), "regular_ts", {"change": {"target"}}), + (TheilSenTrendTransform(in_column="target"), "regular_ts", {"change": {"target"}}), + (STLTransform(in_column="target", period=7), "regular_ts", {"change": {"target"}}), + (TrendTransform(in_column="target", out_column="res"), "regular_ts", {}), + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {}, + ), + (MeanSegmentEncoderTransform(), "regular_ts", {}), + (SegmentEncoderTransform(), "regular_ts", {}), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {}), + (FilterFeaturesTransform(exclude=["year"], return_features=True), "ts_with_exog", {"create": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + GaleShapleyFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"month", "year", "positive"}}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"weekday", "monthday", "positive"}}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {}, + ), + ( + TreeFeatureSelectionTransform( + model=DecisionTreeRegressor(random_state=42), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"year", "month", "weekday"}}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + 
(AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {"change": {"target"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {"change": {"target"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {"change": {"target"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (MADTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (MaxTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (MeanTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (MedianTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + ( + MinMaxDifferenceTransform(in_column="target", window=7, out_column="res"), + "regular_ts", + {}, + ), + (MinTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=7, out_column="res"), + "regular_ts", + {}, + ), + (StdTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + (SumTransform(in_column="target", window=7, out_column="res"), "regular_ts", {}), + ( + BoxCoxTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "positive_ts", + {}, + ), + ( + BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), + "positive_ts", + {"change": {"target"}}, + ), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {}, + ), + 
(BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {"change": {"target"}}), + ( + MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + # setting clip=False is important + MinMaxScalerTransform(in_column="target", mode="per-segment", clip=False, inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + # setting clip=False is important + MinMaxScalerTransform(in_column="target", mode="macro", clip=False, inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + 
StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {"change": {"target"}}), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False, out_column="res" + ), + "ts_to_resample", + {}, + ), + ( + # this behaviour can be unexpected for someone + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + {}, + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + ( + FourierTransform(period=7, order=2, out_column="res"), + "regular_ts", + {}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + (SpecialDaysTransform(), "regular_ts", {}), + ], + ) + def test_inverse_transform_future_with_target(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_with_target(ts, transform, expected_changes=expected_changes) + + @pytest.mark.parametrize( + "transform, dataset_name, 
expected_changes", + [ + (DifferencingTransform(in_column="target", inplace=True), "regular_ts", {}), + ], + ) + def test_inverse_transform_future_with_target_fail_difference( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(ValueError, match="Test should go after the train without gaps"): + self._test_inverse_transform_future_with_target(ts, transform, expected_changes=expected_changes) + + # It is the only transform that doesn't change values back during `inverse_transform` + @to_be_fixed(raises=AssertionError) + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + {"change": {"regressor_exog"}}, + ), + ], + ) + def test_inverse_transform_future_with_target_fail_resample( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_with_target(ts, transform, expected_changes=expected_changes) + + +class TestInverseTransformFutureWithoutTarget: + """Test inverse transform on future dataset with unknown target. + + Expected that inverse transformation creates columns, removes columns and reverts values back to original. 
+ """ + + def _test_inverse_transform_future_without_target( + self, ts, transform, expected_changes, gap_size=28, transform_size=7 + ): + # select subset of tsdataset + history_ts, future_ts = ts.train_test_split(test_size=gap_size) + future_ts_without_transform = future_ts + future_ts_with_transform = deepcopy(future_ts) + future_ts_without_transform.transforms = [] + future_ts_with_transform.transforms = [transform] + train_df = history_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # prepare df without transform + test_ts = future_ts_without_transform.make_future(future_steps=transform_size) + test_df = test_ts.to_pandas() + + # transform + transformed_test_ts = future_ts_with_transform.make_future(future_steps=transform_size) + transformed_test_df = transformed_test_ts.to_pandas() + + # inverse transform + inverse_transformed_test_df = transform.inverse_transform(transformed_test_df.copy()) + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + flat_inverse_transformed_test_df = TSDataset.to_flatten(inverse_transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff( + flat_transformed_test_df, flat_inverse_transformed_test_df + ) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + pd.testing.assert_frame_equal( + flat_test_df[changed_columns], + flat_inverse_transformed_test_df[changed_columns], + ) + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + 
change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + out_column="res", + ), + "regular_ts", + {}, + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + {}, + ), + ( + ChangePointsTrendTransform( + in_column="positive", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "ts_with_exog", + {"change": {"positive"}}, + ), + (BinsegTrendTransform(in_column="target"), "regular_ts", {}), + (BinsegTrendTransform(in_column="positive"), "ts_with_exog", {"change": {"positive"}}), + (LinearTrendTransform(in_column="target"), "regular_ts", {}), + (LinearTrendTransform(in_column="positive"), "ts_with_exog", {"change": {"positive"}}), + (TheilSenTrendTransform(in_column="target"), "regular_ts", {}), + (TheilSenTrendTransform(in_column="positive"), "ts_with_exog", {"change": {"positive"}}), + (STLTransform(in_column="target", period=7), "regular_ts", {}), + (STLTransform(in_column="positive", period=7), "ts_with_exog", {"change": {"positive"}}), + (TrendTransform(in_column="target", out_column="res"), "regular_ts", {}), + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {}, + ), + (MeanSegmentEncoderTransform(), "regular_ts", {}), + (SegmentEncoderTransform(), "regular_ts", {}), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, 
inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {}), + (AddConstTransform(in_column="positive", value=1, inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="positive", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "ts_with_exog", + {"change": {"positive"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {}), + (LogTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (MADTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (MaxTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (MeanTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (MedianTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + ( + MinMaxDifferenceTransform(in_column="target", window=14, out_column="res"), + "regular_ts", + {}, + ), + (MinTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=14, out_column="res"), + "regular_ts", + {}, + ), + (StdTransform(in_column="target", window=14, out_column="res"), "regular_ts", {}), + (SumTransform(in_column="target", 
window=14, out_column="res"), "regular_ts", {}), + ( + BoxCoxTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "positive_ts", + {}, + ), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts", {}), + ( + BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {}), + ( + BoxCoxTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + MaxAbsScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + # setting clip=False is important + MinMaxScalerTransform(in_column="positive", mode="per-segment", clip=False, inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + 
MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + # setting clip=False is important + MinMaxScalerTransform(in_column="positive", mode="macro", clip=False, inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + RobustScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + StandardScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + 
YeoJohnsonTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {}), + ( + YeoJohnsonTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False, out_column="res" + ), + "ts_to_resample", + {}, + ), + ( + # this behaviour can be unexpected for someone + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + {}, + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + ( + FourierTransform(period=7, order=2, out_column="res"), + "regular_ts", + {}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {}, + ), + (SpecialDaysTransform(), "regular_ts", {}), + ], + ) + def test_inverse_transform_future_without_target(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_without_target(ts, transform, expected_changes=expected_changes) + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts", {}), + ], + ) + def 
test_inverse_transform_future_without_target_fail_difference( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(ValueError, match="Test should go after the train without gaps"): + self._test_inverse_transform_future_without_target(ts, transform, expected_changes=expected_changes) + + # It is the only transform that doesn't change values back during `inverse_transform` + @to_be_fixed(AssertionError) + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + {"change": {"regressor_exog"}}, + ), + ], + ) + def test_inverse_transform_future_without_target_fail_resample( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_without_target(ts, transform, expected_changes=expected_changes) + + @to_be_fixed(raises=Exception) + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # feature_selection + (FilterFeaturesTransform(exclude=["year"], return_features=True), "ts_with_exog", {"create": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"month", "year", "weekday"}}, + ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"weekday", "monthday", "positive"}}, + ), + ( + TreeFeatureSelectionTransform( + model=DecisionTreeRegressor(random_state=42), top_k=2, return_features=True + ), + "ts_with_exog", + {"create": {"year", "month", "weekday"}}, + ), + ], + ) + def test_inverse_transform_future_without_target_failed_error( + self, transform, dataset_name, expected_changes, request + ): + ts = 
request.getfixturevalue(dataset_name) + self._test_inverse_transform_future_without_target(ts, transform, expected_changes=expected_changes) diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py new file mode 100644 index 000000000..9ed8ace5f --- /dev/null +++ b/tests/test_transforms/test_inference/test_transform.py @@ -0,0 +1,1644 @@ +from copy import deepcopy + +import pandas as pd +import pytest +from pandas.util.testing import assert_frame_equal +from ruptures import Binseg +from sklearn.linear_model import LinearRegression +from sklearn.tree import DecisionTreeRegressor + +from etna.analysis import StatisticsRelevanceTable +from etna.datasets import TSDataset +from etna.models import ProphetModel +from etna.transforms import AddConstTransform +from etna.transforms import BinsegTrendTransform +from etna.transforms import BoxCoxTransform +from etna.transforms import ChangePointsSegmentationTransform +from etna.transforms import ChangePointsTrendTransform +from etna.transforms import DateFlagsTransform +from etna.transforms import DensityOutliersTransform +from etna.transforms import DifferencingTransform +from etna.transforms import FilterFeaturesTransform +from etna.transforms import FourierTransform +from etna.transforms import GaleShapleyFeatureSelectionTransform +from etna.transforms import HolidayTransform +from etna.transforms import LabelEncoderTransform +from etna.transforms import LagTransform +from etna.transforms import LambdaTransform +from etna.transforms import LinearTrendTransform +from etna.transforms import LogTransform +from etna.transforms import MADTransform +from etna.transforms import MaxAbsScalerTransform +from etna.transforms import MaxTransform +from etna.transforms import MeanSegmentEncoderTransform +from etna.transforms import MeanTransform +from etna.transforms import MedianOutliersTransform +from etna.transforms import MedianTransform +from etna.transforms import 
MinMaxDifferenceTransform +from etna.transforms import MinMaxScalerTransform +from etna.transforms import MinTransform +from etna.transforms import MRMRFeatureSelectionTransform +from etna.transforms import OneHotEncoderTransform +from etna.transforms import PredictionIntervalOutliersTransform +from etna.transforms import QuantileTransform +from etna.transforms import ResampleWithDistributionTransform +from etna.transforms import RobustScalerTransform +from etna.transforms import SegmentEncoderTransform +from etna.transforms import SpecialDaysTransform +from etna.transforms import StandardScalerTransform +from etna.transforms import StdTransform +from etna.transforms import STLTransform +from etna.transforms import SumTransform +from etna.transforms import TheilSenTrendTransform +from etna.transforms import TimeFlagsTransform +from etna.transforms import TimeSeriesImputerTransform +from etna.transforms import TreeFeatureSelectionTransform +from etna.transforms import TrendTransform +from etna.transforms import YeoJohnsonTransform +from etna.transforms.decomposition import RupturesChangePointsModel +from tests.test_transforms.test_inference.common import find_columns_diff +from tests.utils import select_segments_subset +from tests.utils import to_be_fixed + + +class TestTransformTrainSubsetSegments: + """Test transform on train part of subset of segments. + + Expected that transformation on subset of segments match subset of transformation on full dataset. 
+ """ + + def _test_transform_train_subset_segments(self, ts, transform, segments): + # select subset of tsdataset + segments = list(set(segments)) + subset_ts = select_segments_subset(ts=deepcopy(ts), segments=segments) + df = ts.to_pandas() + subset_df = subset_ts.to_pandas() + + # fitting + transform.fit(df) + + # transform full + transformed_df = transform.transform(df) + + # transform subset of segments + transformed_subset_df = transform.transform(subset_df) + + # checking + assert_frame_equal(transformed_subset_df, transformed_df.loc[:, pd.IndexSlice[segments, :]]) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), + (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), + (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), + # math + (AddConstTransform(in_column="target", value=1, 
inplace=False), "regular_ts"), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts"), + (LagTransform(in_column="target", lags=[1, 2, 3]), "regular_ts"), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False), + "regular_ts", + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + ), + (LogTransform(in_column="target", inplace=False), "positive_ts"), + (LogTransform(in_column="target", inplace=True), "positive_ts"), + (DifferencingTransform(in_column="target", inplace=False), "regular_ts"), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (MADTransform(in_column="target", window=7), "regular_ts"), + (MaxTransform(in_column="target", window=7), "regular_ts"), + (MeanTransform(in_column="target", window=7), "regular_ts"), + (MedianTransform(in_column="target", window=7), "regular_ts"), + (MinMaxDifferenceTransform(in_column="target", window=7), "regular_ts"), + (MinTransform(in_column="target", window=7), "regular_ts"), + (QuantileTransform(in_column="target", quantile=0.9, window=7), "regular_ts"), + (StdTransform(in_column="target", window=7), "regular_ts"), + (SumTransform(in_column="target", window=7), "regular_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + 
(MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", + distribution_column="target", + inplace=False, + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + (TimeSeriesImputerTransform(in_column="target"), "ts_to_fill"), + # outliers + 
(DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + # timestamp + (DateFlagsTransform(), "regular_ts"), + (FourierTransform(period=7, order=2), "regular_ts"), + (HolidayTransform(), "regular_ts"), + (SpecialDaysTransform(), "regular_ts"), + (TimeFlagsTransform(), "regular_ts"), + ], + ) + def test_transform_train_subset_segments(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_train_subset_segments(ts, transform, segments=["segment_2"]) + + +class TestTransformFutureSubsetSegments: + """Test transform on future part of subset of segments. + + Expected that transformation on subset of segments match subset of transformation on full dataset. + """ + + def _test_transform_future_subset_segments(self, ts, transform, segments, horizon=7): + # select subset of tsdataset + subset_ts = select_segments_subset(ts=deepcopy(ts), segments=segments) + train_df = ts.to_pandas() + ts.transforms = [transform] + subset_ts.transforms = [transform] + + # fitting + transform.fit(train_df) + + # transform full + transformed_future_ts = ts.make_future(future_steps=horizon) + + # transform subset of segments + transformed_subset_future_ts = subset_ts.make_future(future_steps=horizon) + + # checking + transformed_future_df = transformed_future_ts.to_pandas() + transformed_subset_future_df = transformed_subset_future_ts.to_pandas() + assert_frame_equal(transformed_subset_future_df, transformed_future_df.loc[:, pd.IndexSlice[segments, :]]) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", 
change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="positive", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "ts_with_exog", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (BinsegTrendTransform(in_column="positive"), "ts_with_exog"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="positive"), "ts_with_exog"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="positive"), "ts_with_exog"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (STLTransform(in_column="positive", period=7), "ts_with_exog"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), + (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), + (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), + # math + (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts"), + (AddConstTransform(in_column="positive", value=1, inplace=True), "ts_with_exog"), + (LagTransform(in_column="target", lags=[1, 2, 3]), "regular_ts"), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False), + "regular_ts", + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda 
x: x - 1, + inplace=True, + ), + "regular_ts", + ), + ( + LambdaTransform( + in_column="positive", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "ts_with_exog", + ), + (LogTransform(in_column="target", inplace=False), "positive_ts"), + (LogTransform(in_column="target", inplace=True), "positive_ts"), + (LogTransform(in_column="positive", inplace=True), "ts_with_exog"), + (DifferencingTransform(in_column="target", inplace=False), "regular_ts"), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"), + (MADTransform(in_column="target", window=14), "regular_ts"), + (MaxTransform(in_column="target", window=14), "regular_ts"), + (MeanTransform(in_column="target", window=14), "regular_ts"), + (MedianTransform(in_column="target", window=14), "regular_ts"), + (MinMaxDifferenceTransform(in_column="target", window=14), "regular_ts"), + (MinTransform(in_column="target", window=14), "regular_ts"), + (QuantileTransform(in_column="target", quantile=0.9, window=14), "regular_ts"), + (StdTransform(in_column="target", window=14), "regular_ts"), + (SumTransform(in_column="target", window=14), "regular_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + 
(MaxAbsScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + 
(StandardScalerTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="positive", mode="macro", inplace=True), "ts_with_exog"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", + distribution_column="target", + inplace=False, + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + (TimeSeriesImputerTransform(in_column="target"), "ts_to_fill"), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + # timestamp + (DateFlagsTransform(), "regular_ts"), + (FourierTransform(period=7, order=2), "regular_ts"), + (HolidayTransform(), "regular_ts"), + (SpecialDaysTransform(), "regular_ts"), + (TimeFlagsTransform(), "regular_ts"), + ], + ) + def test_transform_future_subset_segments(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_future_subset_segments(ts, transform, segments=["segment_2"]) + + +class TestTransformTrainNewSegments: + """Test transform on train part of new segments. + + Expected that transformation creates columns, removes columns and changes values. 
+ """ + + def _test_transform_train_new_segments(self, ts, transform, train_segments, expected_changes): + # select subset of tsdataset + train_segments = list(set(train_segments)) + forecast_segments = list(set(ts.segments) - set(train_segments)) + train_ts = select_segments_subset(ts=deepcopy(ts), segments=train_segments) + test_ts = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + train_df = train_ts.to_pandas() + test_df = test_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # transform + transformed_test_df = transform.transform(test_df.copy()) + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff(flat_test_df, flat_transformed_test_df) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {"create": {"res"}}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, + ), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {"remove": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"weekday", "year", "month"}}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"weekday", "monthday", 
"positive"}}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {"remove": {"year", "month", "weekday"}}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {"change": {"target"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3"}}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {"change": {"target"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {"create": {"res"}}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {"change": {"target"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (MADTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (MaxTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (MeanTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (MedianTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + MinMaxDifferenceTransform(in_column="target", window=7, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (MinTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=7, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + 
(StdTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (SumTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {"create": {"res_target"}}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {"change": {"target"}}), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {"change": {"target"}}), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_day_number_in_week", "res_day_number_in_month", "res_is_weekend"}}, + ), + ( + FourierTransform(period=7, 
order=2, out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3", "res_4"}}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {"create": {"res"}}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_minute_in_hour_number", "res_hour_number"}}, + ), + ], + ) + def test_transform_train_new_segments(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_train_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes + ) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # math + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + 
(MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + ( + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + ], + ) + def test_transform_train_new_segments_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(NotImplementedError): + self._test_transform_train_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + @to_be_fixed(raises=NotImplementedError, match="Per-segment transforms can't work on new segments") + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # timestamp + (SpecialDaysTransform(), "regular_ts"), + ], + ) + def test_transform_train_new_segments_failed_not_implemented(self, transform, dataset_name, request): + ts = 
request.getfixturevalue(dataset_name) + self._test_transform_train_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + +class TestTransformFutureNewSegments: + """Test transform on future part of new segments. + + Expected that transformation creates columns, removes columns and changes values. + """ + + def _test_transform_future_new_segments(self, ts, transform, train_segments, expected_changes, horizon=7): + # select subset of tsdataset + train_segments = list(set(train_segments)) + forecast_segments = list(set(ts.segments) - set(train_segments)) + train_ts = select_segments_subset(ts=deepcopy(ts), segments=train_segments) + test_ts_without_transform = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + test_ts_with_transform = select_segments_subset(ts=deepcopy(ts), segments=forecast_segments) + test_ts_without_transform.transforms = [] + test_ts_with_transform.transforms = [transform] + train_df = train_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # prepare df without transform + test_ts = test_ts_without_transform.make_future(future_steps=horizon) + test_df = test_ts.to_pandas() + + # transform + transformed_test_ts = test_ts_with_transform.make_future(future_steps=horizon) + transformed_test_df = transformed_test_ts.to_pandas() + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff(flat_test_df, flat_transformed_test_df) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + + @pytest.mark.parametrize( + "transform, 
dataset_name, expected_changes", + [ + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {"create": {"res"}}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, + ), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {"remove": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"weekday", "year", "month"}}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {"remove": {"year", "month", "weekday"}}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {}), + (AddConstTransform(in_column="positive", value=1, inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3"}}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="positive", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "ts_with_exog", + {"change": {"positive"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", 
{"create": {"res"}}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {}), + (LogTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (MADTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (MaxTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (MeanTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (MedianTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + MinMaxDifferenceTransform(in_column="target", window=14, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (MinTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=14, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (StdTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (SumTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {"create": {"res_target"}}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {}), + ( + BoxCoxTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + 
{"change": {"positive"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MinMaxScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {}), + ( + YeoJohnsonTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_day_number_in_week", "res_day_number_in_month", "res_is_weekend"}}, + ), + ( + FourierTransform(period=7, order=2, out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3", "res_4"}}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {"create": {"res"}}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {"create": 
{"res_minute_in_hour_number", "res_hour_number"}}, + ), + ], + ) + def test_transform_future_new_segments(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes=expected_changes + ) + + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + ), + "regular_ts", + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + ), + (BinsegTrendTransform(in_column="target"), "regular_ts"), + (LinearTrendTransform(in_column="target"), "regular_ts"), + (TheilSenTrendTransform(in_column="target"), "regular_ts"), + (STLTransform(in_column="target", period=7), "regular_ts"), + (TrendTransform(in_column="target"), "regular_ts"), + # encoders + (MeanSegmentEncoderTransform(), "regular_ts"), + (SegmentEncoderTransform(), "regular_ts"), + # math + (DifferencingTransform(in_column="target", inplace=True), "regular_ts"), + (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=False), "positive_ts"), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts"), + (BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MaxAbsScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + 
(MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (MinMaxScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (RobustScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (StandardScalerTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False), "regular_ts"), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts"), + (YeoJohnsonTransform(in_column="positive", mode="per-segment", inplace=True), "ts_with_exog"), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False + ), + "ts_to_resample", + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + ), + ( + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers"), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers"), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers"), + ], + ) + def test_transform_future_new_segments_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(NotImplementedError): + self._test_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], 
expected_changes={} + ) + + @to_be_fixed(raises=NotImplementedError, match="Per-segment transforms can't work on new segments") + @pytest.mark.parametrize( + "transform, dataset_name", + [ + # timestamp + (SpecialDaysTransform(), "regular_ts"), + ], + ) + def test_transform_future_new_segments_failed_not_implemented(self, transform, dataset_name, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_future_new_segments( + ts, transform, train_segments=["segment_1", "segment_2"], expected_changes={} + ) + + +class TestTransformFutureWithTarget: + """Test transform on future dataset with known target. + + Expected that transformation creates columns, removes columns and changes values. + """ + + def _test_transform_future_with_target(self, ts, transform, expected_changes, gap_size=7, transform_size=50): + # select subset of tsdataset + history_ts, future_full_ts = ts.train_test_split(test_size=gap_size + transform_size) + _, test_ts = future_full_ts.train_test_split(test_size=transform_size) + train_df = history_ts.to_pandas() + test_df = test_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # transform + transformed_test_df = transform.transform(test_df.copy()) + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff(flat_test_df, flat_transformed_test_df) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + 
change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + out_column="res", + ), + "regular_ts", + {"create": {"res"}}, + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + {"change": {"target"}}, + ), + (BinsegTrendTransform(in_column="target"), "regular_ts", {"change": {"target"}}), + (LinearTrendTransform(in_column="target"), "regular_ts", {"change": {"target"}}), + (TheilSenTrendTransform(in_column="target"), "regular_ts", {"change": {"target"}}), + (STLTransform(in_column="target", period=7), "regular_ts", {"change": {"target"}}), + (TrendTransform(in_column="target", out_column="res"), "regular_ts", {"create": {"res"}}), + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {"create": {"res"}}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, + ), + (MeanSegmentEncoderTransform(), "regular_ts", {"create": {"segment_mean"}}), + (SegmentEncoderTransform(), "regular_ts", {"create": {"segment_code"}}), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {"remove": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"month", "year", "positive"}}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {"remove": {"year", "month", "weekday"}}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (AddConstTransform(in_column="target", value=1, 
inplace=True), "regular_ts", {"change": {"target"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3"}}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {"change": {"target"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {"create": {"res"}}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {"change": {"target"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts", {"change": {"target"}}), + (MADTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (MaxTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (MeanTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (MedianTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + MinMaxDifferenceTransform(in_column="target", window=7, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (MinTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=7, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (StdTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + (SumTransform(in_column="target", window=7, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + BoxCoxTransform(in_column="target", mode="per-segment", inplace=False, 
out_column="res"), + "positive_ts", + {"create": {"res_target"}}, + ), + ( + BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), + "positive_ts", + {"change": {"target"}}, + ), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {"create": {"res_target"}}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {"change": {"target"}}), + ( + MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + 
RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), + "regular_ts", + {"change": {"target"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {"change": {"target"}}), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False, out_column="res" + ), + "ts_to_resample", + {"create": {"res"}}, + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + {"change": {"regressor_exog"}}, + ), + ( + # this behaviour can be unexpected for someone + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + {}, + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}), + # 
timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_day_number_in_week", "res_day_number_in_month", "res_is_weekend"}}, + ), + ( + FourierTransform(period=7, order=2, out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3", "res_4"}}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {"create": {"res"}}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_minute_in_hour_number", "res_hour_number"}}, + ), + (SpecialDaysTransform(), "regular_ts", {"create": {"anomaly_weekdays", "anomaly_monthdays"}}), + ], + ) + def test_transform_future_with_target(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_future_with_target(ts, transform, expected_changes=expected_changes) + + +class TestTransformFutureWithoutTarget: + """Test transform on future dataset with unknown target. + + Expected that transformation creates columns, removes columns and changes values. 
+ """ + + def _test_transform_future_without_target(self, ts, transform, expected_changes, gap_size=28, transform_size=7): + # select subset of tsdataset + history_ts, future_ts = ts.train_test_split(test_size=gap_size) + future_ts_without_transform = future_ts + future_ts_with_transform = deepcopy(future_ts) + future_ts_without_transform.transforms = [] + future_ts_with_transform.transforms = [transform] + train_df = history_ts.to_pandas() + + # fitting + transform.fit(train_df) + + # prepare df without transform + test_ts = future_ts_without_transform.make_future(future_steps=transform_size) + test_df = test_ts.to_pandas() + + # transform + transformed_test_ts = future_ts_with_transform.make_future(future_steps=transform_size) + transformed_test_df = transformed_test_ts.to_pandas() + + # checking + expected_columns_to_create = expected_changes.get("create", set()) + expected_columns_to_remove = expected_changes.get("remove", set()) + expected_columns_to_change = expected_changes.get("change", set()) + flat_test_df = TSDataset.to_flatten(test_df) + flat_transformed_test_df = TSDataset.to_flatten(transformed_test_df) + created_columns, removed_columns, changed_columns = find_columns_diff(flat_test_df, flat_transformed_test_df) + + assert created_columns == expected_columns_to_create + assert removed_columns == expected_columns_to_remove + assert changed_columns == expected_columns_to_change + + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + # decomposition + ( + ChangePointsSegmentationTransform( + in_column="target", + change_point_model=RupturesChangePointsModel(change_point_model=Binseg(), n_bkps=5), + out_column="res", + ), + "regular_ts", + {"create": {"res"}}, + ), + ( + ChangePointsTrendTransform( + in_column="target", change_point_model=Binseg(), detrend_model=LinearRegression(), n_bkps=5 + ), + "regular_ts", + {}, + ), + ( + ChangePointsTrendTransform( + in_column="positive", change_point_model=Binseg(), 
detrend_model=LinearRegression(), n_bkps=5 + ), + "ts_with_exog", + {"change": {"positive"}}, + ), + (BinsegTrendTransform(in_column="target"), "regular_ts", {}), + (BinsegTrendTransform(in_column="positive"), "ts_with_exog", {"change": {"positive"}}), + (LinearTrendTransform(in_column="target"), "regular_ts", {}), + (LinearTrendTransform(in_column="positive"), "ts_with_exog", {"change": {"positive"}}), + (TheilSenTrendTransform(in_column="target"), "regular_ts", {}), + (TheilSenTrendTransform(in_column="positive"), "ts_with_exog", {"change": {"positive"}}), + (STLTransform(in_column="target", period=7), "regular_ts", {}), + (STLTransform(in_column="positive", period=7), "ts_with_exog", {"change": {"positive"}}), + (TrendTransform(in_column="target", out_column="res"), "regular_ts", {"create": {"res"}}), + # encoders + (LabelEncoderTransform(in_column="weekday", out_column="res"), "ts_with_exog", {"create": {"res"}}), + ( + OneHotEncoderTransform(in_column="weekday", out_column="res"), + "ts_with_exog", + {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, + ), + (MeanSegmentEncoderTransform(), "regular_ts", {"create": {"segment_mean"}}), + (SegmentEncoderTransform(), "regular_ts", {"create": {"segment_code"}}), + # feature_selection + (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {"remove": {"year"}}), + ( + GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"month", "year", "weekday"}}, + ), + ( + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), + ( + TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), + "ts_with_exog", + {"remove": {"year", "month", "weekday"}}, + ), + # math + ( + AddConstTransform(in_column="target", value=1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + 
(AddConstTransform(in_column="target", value=1, inplace=True), "regular_ts", {}), + (AddConstTransform(in_column="positive", value=1, inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + LagTransform(in_column="target", lags=[1, 2, 3], out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3"}}, + ), + ( + LambdaTransform(in_column="target", transform_func=lambda x: x + 1, inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + ( + LambdaTransform( + in_column="target", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "regular_ts", + {}, + ), + ( + LambdaTransform( + in_column="positive", + transform_func=lambda x: x + 1, + inverse_transform_func=lambda x: x - 1, + inplace=True, + ), + "ts_with_exog", + {"change": {"positive"}}, + ), + (LogTransform(in_column="target", inplace=False, out_column="res"), "positive_ts", {"create": {"res"}}), + (LogTransform(in_column="target", inplace=True), "positive_ts", {}), + (LogTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}), + ( + DifferencingTransform(in_column="target", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (DifferencingTransform(in_column="target", inplace=True), "regular_ts", {}), + (DifferencingTransform(in_column="positive", inplace=True), "ts_with_exog", {"change": {"positive"}}), + (MADTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (MaxTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (MeanTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (MedianTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + MinMaxDifferenceTransform(in_column="target", window=14, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + 
(MinTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + QuantileTransform(in_column="target", quantile=0.9, window=14, out_column="res"), + "regular_ts", + {"create": {"res"}}, + ), + (StdTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + (SumTransform(in_column="target", window=14, out_column="res"), "regular_ts", {"create": {"res"}}), + ( + BoxCoxTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "positive_ts", + {"create": {"res_target"}}, + ), + (BoxCoxTransform(in_column="target", mode="per-segment", inplace=True), "positive_ts", {}), + ( + BoxCoxTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + BoxCoxTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "positive_ts", + {"create": {"res_target"}}, + ), + (BoxCoxTransform(in_column="target", mode="macro", inplace=True), "positive_ts", {}), + ( + BoxCoxTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (MaxAbsScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + MaxAbsScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MaxAbsScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MaxAbsScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="per-segment", 
inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (MinMaxScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + MinMaxScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + MinMaxScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + MinMaxScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (RobustScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + RobustScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + ( + RobustScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + RobustScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (StandardScalerTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + StandardScalerTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": 
{"res_target"}}, + ), + ( + StandardScalerTransform(in_column="target", mode="macro", inplace=True), + "regular_ts", + {}, + ), + ( + StandardScalerTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (YeoJohnsonTransform(in_column="target", mode="per-segment", inplace=True), "regular_ts", {}), + ( + YeoJohnsonTransform(in_column="positive", mode="per-segment", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + ( + YeoJohnsonTransform(in_column="target", mode="macro", inplace=False, out_column="res"), + "regular_ts", + {"create": {"res_target"}}, + ), + (YeoJohnsonTransform(in_column="target", mode="macro", inplace=True), "regular_ts", {}), + ( + YeoJohnsonTransform(in_column="positive", mode="macro", inplace=True), + "ts_with_exog", + {"change": {"positive"}}, + ), + # missing_values + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=False, out_column="res" + ), + "ts_to_resample", + {"create": {"res"}}, + ), + ( + ResampleWithDistributionTransform( + in_column="regressor_exog", distribution_column="target", inplace=True + ), + "ts_to_resample", + {"change": {"regressor_exog"}}, + ), + ( + # this behaviour can be unexpected for someone + TimeSeriesImputerTransform(in_column="target"), + "ts_to_fill", + {}, + ), + # outliers + (DensityOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (MedianOutliersTransform(in_column="target"), "ts_with_outliers", {}), + (PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), "ts_with_outliers", {}), + # timestamp + ( + DateFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_day_number_in_week", "res_day_number_in_month", "res_is_weekend"}}, + ), + ( + FourierTransform(period=7, order=2, 
out_column="res"), + "regular_ts", + {"create": {"res_1", "res_2", "res_3", "res_4"}}, + ), + (HolidayTransform(out_column="res"), "regular_ts", {"create": {"res"}}), + ( + TimeFlagsTransform(out_column="res"), + "regular_ts", + {"create": {"res_minute_in_hour_number", "res_hour_number"}}, + ), + (SpecialDaysTransform(), "regular_ts", {"create": {"anomaly_weekdays", "anomaly_monthdays"}}), + ], + ) + def test_transform_future_without_target(self, transform, dataset_name, expected_changes, request): + ts = request.getfixturevalue(dataset_name) + self._test_transform_future_without_target(ts, transform, expected_changes=expected_changes) diff --git a/tests/test_transforms/test_math/test_differencing_transform.py b/tests/test_transforms/test_math/test_differencing_transform.py index 717471c26..ee2ea1ed2 100644 --- a/tests/test_transforms/test_math/test_differencing_transform.py +++ b/tests/test_transforms/test_math/test_differencing_transform.py @@ -1,4 +1,5 @@ from typing import List +from typing import Tuple from typing import Union import numpy as np @@ -39,6 +40,26 @@ def df_nans() -> pd.DataFrame: return df +@pytest.fixture +def df_nans_middle() -> pd.DataFrame: + """Create DataFrame with nans in the middle of the segment.""" + timestamp = pd.date_range("2021-01-01", "2021-04-01") + df_1 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(timestamp.shape[0]), "segment": "1"}) + df_2 = pd.DataFrame({"timestamp": timestamp, "target": np.arange(timestamp.shape[0]) * 2, "segment": "2"}) + df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(df) + df.iloc[5:10, 0] = np.NaN + return df + + +@pytest.fixture +def df_segments_split(df_nans) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Create a pair of DataFrames with different segments.""" + train_df = df_nans.loc[:, pd.IndexSlice["1", :]] + test_df = df_nans.loc[:, pd.IndexSlice["2", :]] + return train_df, test_df + + @pytest.fixture def df_regressors(df_nans) -> pd.DataFrame: """Create 
df_exog for df_nans.""" @@ -82,29 +103,33 @@ def check_interface_transform_autogenerate_column_regressor( def check_transform( - transform: GeneralDifferencingTransform, period: int, order: int, out_column: str, df: pd.DataFrame + transform: GeneralDifferencingTransform, + period: int, + order: int, + out_column: str, + fit_df: pd.DataFrame, + df: pd.DataFrame, ): """Check that differencing transform generates correct values in transform.""" - transformed_df = transform.fit_transform(df) + transform.fit(fit_df) + transformed_df = transform.transform(df) for segment in df.columns.get_level_values("segment").unique(): series_init = df.loc[:, pd.IndexSlice[segment, "target"]] series_transformed = transformed_df.loc[:, pd.IndexSlice[segment, out_column]] - series_init = series_init.loc[series_init.first_valid_index() :] - series_transformed = series_transformed.loc[series_transformed.first_valid_index() :] - - assert series_init.shape[0] == series_transformed.shape[0] + order * period - for _ in range(order): - series_init = series_init.diff(periods=period).iloc[period:] + series_init = series_init.diff(periods=period) - assert np.all(series_init == series_transformed) + assert series_init.equals(series_transformed) -def check_inverse_transform_not_inplace(transform: GeneralDifferencingTransform, df: pd.DataFrame): +def check_inverse_transform_not_inplace( + transform: GeneralDifferencingTransform, train_df: pd.DataFrame, test_df: pd.DataFrame +): """Check that differencing transform does nothing during inverse_transform in non-inplace mode.""" - transformed_df = transform.fit_transform(df) + transform.fit_transform(train_df) + transformed_df = transform.transform(test_df) inverse_transformed_df = transform.inverse_transform(transformed_df) assert transformed_df.equals(inverse_transformed_df) @@ -116,10 +141,10 @@ def check_inverse_transform_inplace_train(transform: GeneralDifferencingTransfor assert inverse_transformed_df.equals(df) -def 
check_inverse_transform_inplace_test( +def check_inverse_transform_inplace_filled_test( transform: GeneralDifferencingTransform, period: int, order: int, df: pd.DataFrame ): - """Check that differencing transform correctly makes inverse_transform on test data in inplace mode.""" + """Check that differencing transform correctly makes inverse_transform on filled test data in inplace mode.""" ts = TSDataset(df, freq="D") ts_train, ts_test = ts.train_test_split(test_size=20) ts_train.fit_transform(transforms=[transform]) @@ -140,6 +165,19 @@ def check_inverse_transform_inplace_test( assert np.all(future_ts.to_pandas() == ts_test.to_pandas()) +def check_inverse_transform_inplace_unfilled_test(transform: GeneralDifferencingTransform, df: pd.DataFrame): + """Check that differencing transform correctly makes inverse_transform on unfilled test data in inplace mode.""" + ts = TSDataset(df, freq="D") + ts_train, ts_test = ts.train_test_split(test_size=20) + ts_train.fit_transform(transforms=[transform]) + + future_ts = ts_train.make_future(20) + + # check values from inverse_transform + future_ts.inverse_transform() + assert future_ts.to_pandas().isna().all().all() + + def check_inverse_transform_inplace_test_quantiles(transform: GeneralDifferencingTransform, df: pd.DataFrame): """Check that differencing transform correctly makes inverse_transform on test data with quantiles.""" ts = TSDataset(df, freq="D") @@ -257,7 +295,7 @@ def test_general_interface_transform_inplace(transform, df_nans): DifferencingTransform(in_column="target", period=1, order=1, inplace=False, out_column="diff"), ], ) -def test_general_transform_not_inplace(transform, df_nans): +def test_general_interface_transform_not_inplace(transform, df_nans): """Test that differencing transform doesn't change in_column in transform in non-inplace mode.""" transformed_df = transform.fit_transform(df_nans) @@ -281,25 +319,39 @@ def test_general_fit_fail_nans(transform, df_nans): transform.fit(df_nans) 
-@pytest.mark.parametrize( - "transform", - [ - _SingleDifferencingTransform(in_column="target", period=1, inplace=False, out_column="diff"), - DifferencingTransform(in_column="target", period=1, order=1, inplace=False, out_column="diff"), - ], -) -def test_general_transform_fail_not_fitted(transform, df_nans): - """Test that differencing transform fails to make transform before fitting.""" - with pytest.raises(AttributeError, match="Transform is not fitted"): +@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) +def test_full_transform_fail_not_fitted(inplace, out_column, df_nans): + """Test that DifferencingTransform transform fails to make transform before fitting.""" + transform = DifferencingTransform(in_column="target", inplace=inplace, out_column=out_column) + with pytest.raises(ValueError, match="Transform is not fitted"): _ = transform.transform(df_nans) +@pytest.mark.parametrize("period", [1, 7]) +def test_single_transform_inplace_new_segments(period, df_segments_split): + """Test that _SingleDifferencingTransform generates correct values in transform on new segments in inplace mode.""" + train_df, test_df = df_segments_split + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=True) + check_transform(transform, period, 1, "target", train_df, test_df) + + +def test_full_transform_inplace_fail_new_segments(df_segments_split): + """Test that DifferencingTransform transform fails to make transform if new segments are present in inplace mode.""" + train_df, test_df = df_segments_split + transform = DifferencingTransform(in_column="target", period=1, order=1, inplace=True) + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.transform(test_df) + + @pytest.mark.parametrize("period", [1, 7]) @pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) def 
test_single_transform(period, inplace, out_column, df_nans): """Test that _SingleDifferencingTransform generates correct values in transform.""" transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=inplace, out_column=out_column) - check_transform(transform, period, 1, out_column, df_nans) + check_transform(transform, period, 1, out_column, df_nans, df_nans) @pytest.mark.parametrize("period", [1, 7]) @@ -310,22 +362,68 @@ def test_full_transform(period, order, inplace, out_column, df_nans): transform = DifferencingTransform( in_column="target", period=period, order=order, inplace=inplace, out_column=out_column ) - check_transform(transform, period, order, out_column, df_nans) + check_transform(transform, period, order, out_column, df_nans, df_nans) -@pytest.mark.parametrize( - "transform", - [ - _SingleDifferencingTransform(in_column="target", period=1, inplace=True), - DifferencingTransform(in_column="target", period=1, order=1, inplace=True), - ], -) -def test_general_inverse_transform_fail_not_fitted(transform, df_nans): - """Test that differencing transform fails to make inverse_transform before fitting.""" - with pytest.raises(AttributeError, match="Transform is not fitted"): +@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) +def test_single_transform_nans_middle(period, inplace, out_column, df_nans, df_nans_middle): + """Test that _SingleDifferencingTransform generates correct values in transform with NaNs in the middle.""" + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=inplace, out_column=out_column) + check_transform(transform, period, 1, out_column, df_nans, df_nans_middle) + + +@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("order", [1, 2]) +@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) +def test_full_transform_nans_middle(period, order, inplace, 
out_column, df_nans, df_nans_middle): + """Test that DifferencingTransform generates correct values in transform with NaNs in the middle.""" + transform = DifferencingTransform( + in_column="target", period=period, order=order, inplace=inplace, out_column=out_column + ) + check_transform(transform, period, order, out_column, df_nans, df_nans_middle) + + +@pytest.mark.parametrize("period", [1, 7]) +def test_single_transform_not_inplace_new_segments(period, df_segments_split): + """Test that _SingleDifferencingTransform generates correct values in transform on new segments in non-inplace mode.""" + train_df, test_df = df_segments_split + out_column = "diff" + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=False, out_column=out_column) + check_transform(transform, period, 1, out_column, train_df, test_df) + + +@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("order", [1, 2]) +def test_full_transform_not_inplace_new_segments(period, order, df_segments_split): + """Test that DifferencingTransform generates correct values in transform on new segments in non-inplace mode.""" + train_df, test_df = df_segments_split + out_column = "diff" + transform = DifferencingTransform( + in_column="target", period=period, order=order, inplace=False, out_column=out_column + ) + check_transform(transform, period, order, out_column, train_df, test_df) + + +@pytest.mark.parametrize("inplace, out_column", [(False, "diff"), (True, "target")]) +def test_full_inverse_transform_fail_not_fitted(inplace, out_column, df_nans): + """Test that DifferencingTransform fails to make inverse_transform before fitting.""" + transform = DifferencingTransform(in_column="target", inplace=inplace, out_column=out_column) + with pytest.raises(ValueError, match="Transform is not fitted"): _ = transform.inverse_transform(df_nans) +def test_full_inverse_transform_inplace_fail_new_segments(df_segments_split): + """Test that DifferencingTransform fails to make 
inverse_transform if new segments are present in inplace mode.""" + train_df, test_df = df_segments_split + transform = DifferencingTransform(in_column="target", period=1, order=1, inplace=True) + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.inverse_transform(test_df) + + @pytest.mark.parametrize( "transform", [ @@ -364,7 +462,7 @@ def test_general_inverse_transform_fail_test_not_right_after_train(transform, df def test_single_inverse_transform_not_inplace(period, df_nans): """Test that _SingleDifferencingTransform does nothing during inverse_transform in non-inplace mode.""" transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=False, out_column="diff") - check_inverse_transform_not_inplace(transform, df_nans) + check_inverse_transform_not_inplace(transform, df_nans, df_nans) @pytest.mark.parametrize("period", [1, 7]) @@ -372,7 +470,24 @@ def test_single_inverse_transform_not_inplace(period, df_nans): def test_full_inverse_transform_not_inplace(period, order, df_nans): """Test that DifferencingTransform does nothing during inverse_transform in non-inplace mode.""" transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=False, out_column="diff") - check_inverse_transform_not_inplace(transform, df_nans) + check_inverse_transform_not_inplace(transform, df_nans, df_nans) + + +@pytest.mark.parametrize("period", [1, 7]) +def test_single_inverse_transform_not_inplace_new_segments(period, df_segments_split): + """Test that _SingleDifferencingTransform does nothing during inverse_transform on new segments in non-inplace mode.""" + train_df, test_df = df_segments_split + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=False, out_column="diff") + check_inverse_transform_not_inplace(transform, train_df, test_df) + + +@pytest.mark.parametrize("period", 
[1, 7]) +@pytest.mark.parametrize("order", [1, 2]) +def test_full_inverse_transform_not_inplace_new_segments(period, order, df_segments_split): + """Test that DifferencingTransform does nothing during inverse_transform on new segments in non-inplace mode.""" + train_df, test_df = df_segments_split + transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=False, out_column="diff") + check_inverse_transform_not_inplace(transform, train_df, test_df) @pytest.mark.parametrize("period", [1, 7]) @@ -390,43 +505,34 @@ def test_full_inverse_transform_inplace_train(period, order, df_nans): check_inverse_transform_inplace_train(transform, df_nans) -@pytest.mark.parametrize( - "transform", - [ - _SingleDifferencingTransform(in_column="target", period=1, inplace=True), - DifferencingTransform(in_column="target", period=1, order=1, inplace=True), - ], -) -def test_general_inverse_transform_inplace_test_fail_nans(transform, df_nans): - """Test that differencing transform fails to make inverse_transform on test data if there are NaNs.""" - ts = TSDataset(df_nans, freq="D") - ts_train, ts_test = ts.train_test_split(test_size=20) - - ts_train.fit_transform(transforms=[transform]) +@pytest.mark.parametrize("period", [1, 7]) +def test_single_inverse_transform_inplace_filled_test(period, df_nans): + """Test that _SingleDifferencingTransform correctly makes inverse_transform on filled test data in inplace mode.""" + transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=True) + check_inverse_transform_inplace_filled_test(transform, period, 1, df_nans) - # make predictions by hand only on one segment - future_ts = ts_train.make_future(20) - future_ts.df.loc[:, pd.IndexSlice["1", "target"]] = np.NaN - future_ts.df.loc[:, pd.IndexSlice["2", "target"]] = 2 - # check fail on inverse_transform - with pytest.raises(ValueError, match="There should be no NaNs inside the segments"): - future_ts.inverse_transform() 
+@pytest.mark.parametrize("period", [1, 7]) +@pytest.mark.parametrize("order", [1, 2]) +def test_full_inverse_transform_inplace_filled_test(period, order, df_nans): + """Test that DifferencingTransform correctly makes inverse_transform on filled test data in inplace mode.""" + transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=True) + check_inverse_transform_inplace_filled_test(transform, period, order, df_nans) @pytest.mark.parametrize("period", [1, 7]) def test_single_inverse_transform_inplace_test(period, df_nans): - """Test that _SingleDifferencingTransform correctly makes inverse_transform on test data in inplace mode.""" + """Test that _SingleDifferencingTransform correctly makes inverse_transform on unfilled test data in inplace mode.""" transform = _SingleDifferencingTransform(in_column="target", period=period, inplace=True) - check_inverse_transform_inplace_test(transform, period, 1, df_nans) + check_inverse_transform_inplace_unfilled_test(transform, df_nans) @pytest.mark.parametrize("period", [1, 7]) @pytest.mark.parametrize("order", [1, 2]) def test_full_inverse_transform_inplace_test(period, order, df_nans): - """Test that DifferencingTransform correctly makes inverse_transform on test data in inplace mode.""" + """Test that DifferencingTransform correctly makes inverse_transform on unfilled test data in inplace mode.""" transform = DifferencingTransform(in_column="target", period=period, order=order, inplace=True) - check_inverse_transform_inplace_test(transform, period, order, df_nans) + check_inverse_transform_inplace_unfilled_test(transform, df_nans) @pytest.mark.parametrize("period", [1, 7]) diff --git a/tests/test_transforms/test_math/test_sklearn_transform_interface.py b/tests/test_transforms/test_math/test_sklearn_transform_interface.py index f38d6f098..1ad8a9650 100644 --- a/tests/test_transforms/test_math/test_sklearn_transform_interface.py +++ 
b/tests/test_transforms/test_math/test_sklearn_transform_interface.py @@ -278,3 +278,335 @@ def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): df_multi = transformed_df.loc[:, pd.IndexSlice[segments, column_multi]] df_single = transformed_dfs_one_column[i].loc[:, pd.IndexSlice[segments, column_single]] assert np.all(df_multi == df_single) + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "mode", + [ + "macro", + "per-segment", + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_transform_not_fitted_fail(transform_constructor, mode, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + transform = transform_constructor(mode=mode, in_column=in_column, inplace=inplace) + + with pytest.raises(ValueError, match="The transform isn't fitted"): + _ = transform.transform(df) + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "mode", + [ + "macro", + "per-segment", + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_inverse_transform_not_fitted_fail(transform_constructor, mode, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + transform = transform_constructor(mode=mode, in_column=in_column, inplace=inplace) + + with pytest.raises(ValueError, match="The transform isn't 
fitted"): + _ = transform.inverse_transform(df) + + +def _check_same_segments(df_1: pd.DataFrame, df_2: pd.DataFrame): + df_1_segments = set(df_1.columns.get_level_values("segment")) + df_2_segments = set(df_2.columns.get_level_values("segment")) + assert df_1_segments == df_2_segments + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "mode", + [ + "macro", + "per-segment", + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_transform_subset_segments(transform_constructor, mode, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df + test_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_2"], :]] + transform = transform_constructor(mode=mode, in_column=in_column, inplace=inplace) + + transform.fit(train_df) + transformed_df = transform.transform(test_df) + + _check_same_segments(transformed_df, test_df) + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "mode", + [ + "macro", + "per-segment", + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_inverse_transform_subset_segments(transform_constructor, mode, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df + test_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_2"], :]] + transform = 
transform_constructor(mode=mode, in_column=in_column, inplace=inplace) + + transform.fit(train_df) + inv_transformed_df = transform.inverse_transform(test_df) + + _check_same_segments(inv_transformed_df, test_df) + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_transform_new_segments_macro(transform_constructor, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_1"], :]] + test_df = df.loc[:, pd.IndexSlice["segment_2", :]] + transform = transform_constructor(mode="macro", in_column=in_column, inplace=inplace) + + transform.fit(train_df) + transformed_df = transform.transform(test_df) + + _check_same_segments(transformed_df, test_df) + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_transform_new_segments_per_segment_fail(transform_constructor, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_1"], :]] + test_df = df.loc[:, pd.IndexSlice["segment_2", :]] + transform = transform_constructor(mode="per-segment", in_column=in_column, inplace=inplace) + + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This 
transform can't process segments that weren't present on train data" + ): + _ = transform.transform(test_df) + + +@pytest.mark.parametrize("inplace", [False, True]) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_inverse_transform_new_segments_macro(transform_constructor, in_column, inplace, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_1"], :]] + test_df = df.loc[:, pd.IndexSlice["segment_2", :]] + transform = transform_constructor(mode="macro", in_column=in_column, inplace=inplace) + + transform.fit(train_df) + transformed_df = transform.inverse_transform(test_df) + + _check_same_segments(transformed_df, test_df) + + +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_inverse_transform_new_segments_per_segment_non_inplace(transform_constructor, in_column, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_1"], :]] + test_df = df.loc[:, pd.IndexSlice["segment_2", :]] + transform = transform_constructor(mode="per-segment", in_column=in_column, inplace=False) + + transform.fit(train_df) + inv_transformed_df = transform.inverse_transform(test_df) + + pd.testing.assert_frame_equal(inv_transformed_df, test_df) + + +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", 
"exog_2"], + ], +) +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_inverse_transform_new_segments_per_segment_inplace_fail(transform_constructor, in_column, multicolumn_ts): + df = multicolumn_ts.to_pandas() + train_df = df.loc[:, pd.IndexSlice[["segment_0", "segment_1"], :]] + test_df = df.loc[:, pd.IndexSlice["segment_2", :]] + transform = transform_constructor(mode="per-segment", in_column=in_column, inplace=True) + + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.inverse_transform(test_df) diff --git a/tests/test_transforms/test_outliers/test_outliers_transform.py b/tests/test_transforms/test_outliers/test_outliers_transform.py index 3bb734f3b..d6de47fd4 100644 --- a/tests/test_transforms/test_outliers/test_outliers_transform.py +++ b/tests/test_transforms/test_outliers/test_outliers_transform.py @@ -155,6 +155,46 @@ def test_inverse_transform_raise_error_if_not_fitted(transform, outliers_solid_t _ = transform.inverse_transform(df=outliers_solid_tsds.df) +@pytest.mark.parametrize( + "transform", + ( + MedianOutliersTransform(in_column="target"), + DensityOutliersTransform(in_column="target"), + PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), + ), +) +def test_transform_new_segments_fail(transform, outliers_solid_tsds): + df = outliers_solid_tsds.to_pandas() + train_df = df.loc[:, pd.IndexSlice["1", :]] + test_df = df.loc[:, pd.IndexSlice["2", :]] + + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.transform(test_df) + + +@pytest.mark.parametrize( 
+ "transform", + ( + MedianOutliersTransform(in_column="target"), + DensityOutliersTransform(in_column="target"), + PredictionIntervalOutliersTransform(in_column="target", model=ProphetModel), + ), +) +def test_inverse_transform_new_segments_fail(transform, outliers_solid_tsds): + df = outliers_solid_tsds.to_pandas() + train_df = df.loc[:, pd.IndexSlice["1", :]] + test_df = df.loc[:, pd.IndexSlice["2", :]] + + transform.fit(train_df) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = transform.inverse_transform(test_df) + + @pytest.mark.parametrize( "transform", ( diff --git a/tests/utils.py b/tests/utils.py index e279d7ec9..3da2bee25 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,9 +1,38 @@ +import functools +from typing import List + import numpy as np +import pandas as pd +import pytest +from etna.datasets import TSDataset from etna.metrics.base import Metric from etna.metrics.base import MetricAggregationMode +def to_be_fixed(raises, match=None): + def to_be_fixed_concrete(func): + @functools.wraps(func) + def wrapped_test(*args, **kwargs): + with pytest.raises(raises, match=match): + return func(*args, **kwargs) + + return wrapped_test + + return to_be_fixed_concrete + + +def select_segments_subset(ts: TSDataset, segments: List[str]) -> TSDataset: + df = ts.raw_df.loc[:, pd.IndexSlice[segments, :]] + df = df.loc[ts.df.index] + df_exog = ts.df_exog + if df_exog is not None: + df_exog = df_exog.loc[:, pd.IndexSlice[segments, :]] + known_future = ts.known_future + freq = ts.freq + return TSDataset(df=df, df_exog=df_exog, known_future=known_future, freq=freq) + + def create_dummy_functional_metric(alpha: float = 1.0): def dummy_functional_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float: return alpha