From a004b97bf0b89dd24a2a3556e63aeff1d5778144 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 1 Aug 2022 13:54:16 +0300 Subject: [PATCH 01/23] Add get_regresors_info --- etna/transforms/base.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index f01c39ba5..f87a23166 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -1,6 +1,7 @@ from abc import ABC from abc import abstractmethod from copy import deepcopy +from typing import List import pandas as pd @@ -20,6 +21,16 @@ class DymmyInColumnMixin: class Transform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" + def get_regressors_info(self) -> List[str]: + """Return the list with regressors created by the transform. + + Returns + ------- + : + List with regressors created by the transform. + """ + return [] + @abstractmethod def fit(self, df: pd.DataFrame) -> "Transform": """Fit feature model. From df117a9eaa43bf9744a1b668c13408052eb5a109 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Thu, 4 Aug 2022 09:23:29 +0300 Subject: [PATCH 02/23] Fix method naming for remove_columns --- etna/datasets/tsdataset.py | 15 ++++---- etna/transforms/base.py | 40 ++++++++++++++++++-- etna/transforms/missing_values/imputation.py | 2 +- tests/test_datasets/test_dataset.py | 12 +++--- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index d89a1b6bd..ee9cba93b 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -900,21 +900,20 @@ def update_columns_from_pandas(self, df: pd.DataFrame, regressors: Optional[List if regressors is not None: self._regressors = list(set(self._regressors) | set(regressors)) - def remove_columns(self, columns: List[str]): + def remove_features(self, features: List[str]): """Remove columns from the dataset. Columns that are not presented in the dataset will be ignored Parameters ---------- - columns: - List of columns to be removed + features: + List of features to be removed """ - for df in [self.df, self.df_exog, self.raw_df]: - columns_in_df = df.columns.get_level_values("feature") - columns_to_remove = list(set(columns_in_df) & set(columns)) - df.drop(columns=columns_to_remove, level="feature", inplace=True) - self._regressors = list(set(self._regressors) - set(columns)) + columns_in_df = self.df.columns.get_level_values("feature") + columns_to_remove = list(set(columns_in_df) & set(features)) + self.df.drop(columns=columns_to_remove, level="feature", inplace=True) + self._regressors = list(set(self._regressors) - set(features)) @property def index(self) -> pd.core.indexes.datetimes.DatetimeIndex: diff --git a/etna/transforms/base.py b/etna/transforms/base.py index f87a23166..5c801600b 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -2,10 +2,13 @@ from abc import abstractmethod from copy import deepcopy from typing import List +from typing import Union import pandas as pd +from typing_extensions import Literal from etna.core import BaseMixin +from etna.datasets import TSDataset class FutureMixin: @@ -21,6 +24,8 @@ class DymmyInColumnMixin: class Transform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" + in_column: Union[Literal["all"], List[str], str] = "target" + def get_regressors_info(self) -> List[str]: """Return the list with regressors created by the transform. 
@@ -31,22 +36,49 @@ def get_regressors_info(self) -> List[str]: """ return [] + def _get_required_features(self) -> Union[Literal["all"], List[str]]: + """Get the list of required features.""" + required_features = self.in_column + if isinstance(required_features, str) and required_features != "all": + required_features = [required_features] + return required_features + @abstractmethod - def fit(self, df: pd.DataFrame) -> "Transform": - """Fit feature model. + def _fit(self, df: pd.DataFrame) -> "Transform": + """Fit the transform. Should be implemented by user. Parameters ---------- - df + df: + Dataframe in etna wide format. Returns ------- : + The fitted transform instance. """ pass + def fit(self, ts: TSDataset) -> "Transform": + """Fit the transform. + + Parameters + ---------- + ts: + Dataset to fit the transform on. + + Returns + ------- + : + The fitted transform instance. + """ + features_to_use = self._get_required_features() + df = ts.to_pandas(flatten=False, features=features_to_use) + self._fit(df=df) + return self + @abstractmethod def transform(self, df: pd.DataFrame) -> pd.DataFrame: """Transform dataframe. @@ -104,7 +136,7 @@ def fit(self, df: pd.DataFrame) -> "PerSegmentWrapper": self.segments = df.columns.get_level_values(0).unique() for segment in self.segments: self.segment_transforms[segment] = deepcopy(self._base_transform) - self.segment_transforms[segment].fit(df[segment]) + self.segment_transforms[segment]._fit(df=df[segment]) return self def transform(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/etna/transforms/missing_values/imputation.py b/etna/transforms/missing_values/imputation.py index 85450a2f0..4fb89e75b 100644 --- a/etna/transforms/missing_values/imputation.py +++ b/etna/transforms/missing_values/imputation.py @@ -89,7 +89,7 @@ def __init__( self.fill_value: Optional[int] = None self.nan_timestamps: Optional[List[pd.Timestamp]] = None - def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform": + def _fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform": """ Fit preprocess params. 
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index e99f0b06d..29a4c1e30 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -962,29 +962,29 @@ def test_update_columns_update_regressors(df_and_regressors, known_future, regre @pytest.mark.parametrize( - "columns, expected_columns", + "features, expected_columns", ( (["regressor_2"], ["timestamp", "segment", "target", "regressor_1"]), (["regressor_2", "out_of_dataset_column"], ["timestamp", "segment", "target", "regressor_1"]), ), ) -def test_remove_columns(df_and_regressors, columns, expected_columns): +def test_remove_features(df_and_regressors, features, expected_columns): df, df_exog, known_future = df_and_regressors ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) - ts.remove_columns(columns=columns) + ts.remove_features(features=features) got_columns = ts.to_flatten(ts.df).columns assert sorted(got_columns) == sorted(expected_columns) @pytest.mark.parametrize( - "columns, expected_regressors", + "features, expected_regressors", ( (["target", "regressor_2"], ["regressor_1"]), (["out_of_dataset_column"], ["regressor_1", "regressor_2"]), ), ) -def test_remove_columns_update_regressors(df_and_regressors, columns, expected_regressors): +def test_remove_features_update_regressors(df_and_regressors, features, expected_regressors): df, df_exog, known_future = df_and_regressors ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) - ts.remove_columns(columns=columns) + ts.remove_features(features=features) assert sorted(ts.regressors) == sorted(expected_regressors) From d7e6ffef0a171865cdaa18f3e1982af3b6c9fdcb Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Thu, 4 Aug 2022 09:25:19 +0300 Subject: [PATCH 03/23] Add new version of base class for transforms --- etna/transforms/base.py | 133 ++++++++++++++++++- etna/transforms/missing_values/imputation.py | 2 +- 2 files changed, 127 insertions(+), 8 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index 5c801600b..d81afbdd6 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -21,7 +21,7 @@ class DymmyInColumnMixin: in_column = "target" -class Transform(ABC, BaseMixin): +class NewTransform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" in_column: Union[Literal["all"], List[str], str] = "target" @@ -36,15 +36,30 @@ def get_regressors_info(self) -> List[str]: """ return [] - def _get_required_features(self) -> Union[Literal["all"], List[str]]: + @property + def required_features(self) -> Union[Literal["all"], List[str]]: """Get the list of required features.""" required_features = self.in_column if isinstance(required_features, str) and required_features != "all": required_features = [required_features] return required_features + def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.DataFrame) -> TSDataset: + """Update TSDataset based on the difference between dfs.""" + columns_before = set(df.columns.get_level_values("features")) + columns_after = set(df_transformed.columns.get_level_values("features")) + + # Transforms now can only remove or only add/update columns + removed_features = list(columns_before - columns_after) + if len(removed_features) != 0: + ts.remove_features(features=removed_features) + else: + new_regressors = self.get_regressors_info() + ts.update_columns_from_pandas(df=df_transformed, regressors=new_regressors) + return ts + @abstractmethod - 
def _fit(self, df: pd.DataFrame) -> "Transform": + def _fit(self, df: pd.DataFrame) -> "NewTransform": """Fit the transform. Should be implemented by user. @@ -61,7 +76,7 @@ def _fit(self, df: pd.DataFrame) -> "Transform": """ pass - def fit(self, ts: TSDataset) -> "Transform": + def fit(self, ts: TSDataset) -> "NewTransform": """Fit the transform. Parameters @@ -74,11 +89,115 @@ def fit(self, ts: TSDataset) -> "Transform": : The fitted transform instance. """ - features_to_use = self._get_required_features() - df = ts.to_pandas(flatten=False, features=features_to_use) + df = ts.to_pandas(flatten=False, features=self.required_features) self._fit(df=df) return self + @abstractmethod + def _transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Transform dataframe. + + Should be implemented by user + + Parameters + ---------- + df: + Dataframe in etna wide format. + + Returns + ------- + : + Transformed Dataframe in etna wide format. + """ + pass + + def transform(self, ts: TSDataset) -> TSDataset: + """Transform TSDataset inplace. + + Parameters + ---------- + ts: + Dataset to transform. + + Returns + ------- + : + Transformed TSDataset. + """ + df = ts.to_pandas(flatten=False, features=self.required_features) + df_transformed = self._transform(df=df) + ts = self._update_dataset(ts=ts, df=df, df_transformed=df_transformed) + return ts + + def fit_transform(self, ts: TSDataset) -> TSDataset: + """Fit and transform TSDataset. + + May be reimplemented. But it is not recommended. + + Parameters + ---------- + ts: + TSDataset to transform. + + Returns + ------- + : + Transformed TSDataset. + """ + return self.fit(ts=ts).transform(ts=ts) + + def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Inverse transform dataframe. + + Parameters + ---------- + df: + Dataframe in etna wide format. + + Returns + ------- + : + Dataframe in etna wide format after applying inverse transformation. + """ + return df + + def inverse_transform(self, ts: TSDataset) -> TSDataset: + """Inverse transform TSDataset. + + Should be reimplemented in the classes with reimplemented _inverse_transform method. + + Parameters + ---------- + ts: + TSDataset to be inverse transformed. + + Returns + ------- + : + TSDataset after applying inverse transformation. + """ + return ts + + +class Transform(ABC, BaseMixin): + """Base class to create any transforms to apply to data.""" + + @abstractmethod + def fit(self, df: pd.DataFrame) -> "Transform": + """Fit feature model. + + Should be implemented by user. + + Parameters + ---------- + df + + Returns + ------- + : + """ + pass + @abstractmethod def transform(self, df: pd.DataFrame) -> pd.DataFrame: """Transform dataframe. 
@@ -136,7 +255,7 @@ def fit(self, df: pd.DataFrame) -> "PerSegmentWrapper": self.segments = df.columns.get_level_values(0).unique() for segment in self.segments: self.segment_transforms[segment] = deepcopy(self._base_transform) - self.segment_transforms[segment]._fit(df=df[segment]) + self.segment_transforms[segment].fit(df[segment]) return self def transform(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/etna/transforms/missing_values/imputation.py b/etna/transforms/missing_values/imputation.py index 4fb89e75b..85450a2f0 100644 --- a/etna/transforms/missing_values/imputation.py +++ b/etna/transforms/missing_values/imputation.py @@ -89,7 +89,7 @@ def __init__( self.fill_value: Optional[int] = None self.nan_timestamps: Optional[List[pd.Timestamp]] = None - def _fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform": + def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform": """ Fit preprocess params. From 24f87bc7c80db852e206b732a716a19c67c3895c Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 5 Aug 2022 13:16:13 +0300 Subject: [PATCH 04/23] Replace DummyInColumnMixin with default in_column in base class --- etna/transforms/base.py | 9 ++------- etna/transforms/encoders/mean_segment_encoder.py | 3 +-- etna/transforms/encoders/segment_encoder.py | 3 +-- etna/transforms/feature_selection/base.py | 2 +- etna/transforms/feature_selection/filter.py | 5 ++--- etna/transforms/nn/pytorch_forecasting.py | 5 ++--- etna/transforms/timestamp/date_flags.py | 3 +-- etna/transforms/timestamp/fourier.py | 3 +-- etna/transforms/timestamp/holiday.py | 3 +-- etna/transforms/timestamp/special_days.py | 3 +-- etna/transforms/timestamp/time_flags.py | 3 +-- 11 files changed, 14 insertions(+), 28 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index d81afbdd6..27ecbb3ef 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -15,16 +15,11 @@ class FutureMixin: """Mixin for transforms that can convert non-regressor column to a regressor one.""" -class DymmyInColumnMixin: - """Mixin for transforms that has no explicit in_column.""" - - in_column = "target" - - class NewTransform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" - in_column: Union[Literal["all"], List[str], str] = "target" + def __init__(self): + in_column: Union[Literal["all"], List[str], str] = "target" def get_regressors_info(self) -> List[str]: """Return the list with regressors created by the transform. diff --git a/etna/transforms/encoders/mean_segment_encoder.py b/etna/transforms/encoders/mean_segment_encoder.py index 7b1a34326..8f518441c 100644 --- a/etna/transforms/encoders/mean_segment_encoder.py +++ b/etna/transforms/encoders/mean_segment_encoder.py @@ -2,12 +2,11 @@ import pandas as pd from etna.transforms import Transform -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.math.statistics import MeanTransform -class MeanSegmentEncoderTransform(Transform, FutureMixin, DymmyInColumnMixin): +class MeanSegmentEncoderTransform(Transform, FutureMixin): """Makes expanding mean target encoding of the segment. 
Creates column 'segment_mean'.""" idx = pd.IndexSlice diff --git a/etna/transforms/encoders/segment_encoder.py b/etna/transforms/encoders/segment_encoder.py index c07bb0907..e899b8eac 100644 --- a/etna/transforms/encoders/segment_encoder.py +++ b/etna/transforms/encoders/segment_encoder.py @@ -1,12 +1,11 @@ import pandas as pd from sklearn import preprocessing -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.base import Transform -class SegmentEncoderTransform(Transform, FutureMixin, DymmyInColumnMixin): +class SegmentEncoderTransform(Transform, FutureMixin): """Encode segment label to categorical. Creates column 'segment_code'.""" idx = pd.IndexSlice diff --git a/etna/transforms/feature_selection/base.py b/etna/transforms/feature_selection/base.py index 17b2aba2a..fb628d526 100644 --- a/etna/transforms/feature_selection/base.py +++ b/etna/transforms/feature_selection/base.py @@ -19,7 +19,7 @@ def __init__(self, features_to_use: Union[List[str], Literal["all"]] = "all", re self.selected_features: List[str] = [] self.return_features = return_features self._df_removed: Optional[pd.DataFrame] = None - self.in_column: Union[Sequence[str], Literal["all"]] = self.features_to_use + self.in_column = self.features_to_use def _get_features_to_use(self, df: pd.DataFrame) -> List[str]: """Get list of features from the dataframe to perform the selection on.""" diff --git a/etna/transforms/feature_selection/filter.py b/etna/transforms/feature_selection/filter.py index ddc079cb2..e13f7e3f0 100644 --- a/etna/transforms/feature_selection/filter.py +++ b/etna/transforms/feature_selection/filter.py @@ -3,14 +3,12 @@ import pandas as pd -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import Transform -class FilterFeaturesTransform(Transform, DymmyInColumnMixin): +class FilterFeaturesTransform(Transform): """Filters features in each segment of the dataframe.""" - in_column = "all" def __init__( self, @@ -43,6 +41,7 @@ def __init__( self.exclude = list(set(exclude)) else: raise ValueError("There should be exactly one option set: include or exclude") + self.in_column = "all" def fit(self, df: pd.DataFrame) -> "FilterFeaturesTransform": """Fit method does nothing and is kept for compatibility. diff --git a/etna/transforms/nn/pytorch_forecasting.py b/etna/transforms/nn/pytorch_forecasting.py index dce5a46b5..9ae1ee3e3 100644 --- a/etna/transforms/nn/pytorch_forecasting.py +++ b/etna/transforms/nn/pytorch_forecasting.py @@ -11,7 +11,6 @@ from etna import SETTINGS from etna.datasets.tsdataset import TSDataset -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import Transform if SETTINGS.torch_required: @@ -28,7 +27,7 @@ NORMALIZER = Union[TorchNormalizer, NaNLabelEncoder, EncoderNormalizer] -class PytorchForecastingTransform(Transform, DymmyInColumnMixin): +class PytorchForecastingTransform(Transform): """Transform for models from PytorchForecasting library. Notes @@ -36,7 +35,6 @@ class PytorchForecastingTransform(Transform, DymmyInColumnMixin): This transform should be added at the very end of ``transforms`` parameter. 
""" - in_column = "all" def __init__( self, @@ -93,6 +91,7 @@ def __init__( self.lags = lags if lags else {} self.scalers = scalers if scalers else {} self.pf_dataset_predict: Optional[TimeSeriesDataSet] = None + self.in_column = "all" def fit(self, df: pd.DataFrame) -> "PytorchForecastingTransform": """ diff --git a/etna/transforms/timestamp/date_flags.py b/etna/transforms/timestamp/date_flags.py index 60abe289d..9d0606263 100644 --- a/etna/transforms/timestamp/date_flags.py +++ b/etna/transforms/timestamp/date_flags.py @@ -6,12 +6,11 @@ import numpy as np import pandas as pd -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.base import Transform -class DateFlagsTransform(Transform, FutureMixin, DymmyInColumnMixin): +class DateFlagsTransform(Transform, FutureMixin): """DateFlagsTransform is a class that implements extraction of the main date-based features from datetime column. Notes diff --git a/etna/transforms/timestamp/fourier.py b/etna/transforms/timestamp/fourier.py index 924da11b6..3a79a3016 100644 --- a/etna/transforms/timestamp/fourier.py +++ b/etna/transforms/timestamp/fourier.py @@ -5,12 +5,11 @@ import numpy as np import pandas as pd -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.base import Transform -class FourierTransform(Transform, FutureMixin, DymmyInColumnMixin): +class FourierTransform(Transform, FutureMixin): """Adds fourier features to the dataset. Notes diff --git a/etna/transforms/timestamp/holiday.py b/etna/transforms/timestamp/holiday.py index 1f66de295..7c5b7b4a3 100644 --- a/etna/transforms/timestamp/holiday.py +++ b/etna/transforms/timestamp/holiday.py @@ -5,12 +5,11 @@ import numpy as np import pandas as pd -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.base import Transform -class HolidayTransform(Transform, FutureMixin, DymmyInColumnMixin): +class HolidayTransform(Transform, FutureMixin): """HolidayTransform generates series that indicates holidays in given dataframe.""" def __init__(self, iso_code: str = "RUS", out_column: Optional[str] = None): diff --git a/etna/transforms/timestamp/special_days.py b/etna/transforms/timestamp/special_days.py index 6a13dd4b4..265257b96 100644 --- a/etna/transforms/timestamp/special_days.py +++ b/etna/transforms/timestamp/special_days.py @@ -4,7 +4,6 @@ import pandas as pd -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.base import PerSegmentWrapper from etna.transforms.base import Transform @@ -167,7 +166,7 @@ def check(x): return df.loc[:, ["datetime"]].apply(check, axis=1).rename("anomaly_monthdays") -class SpecialDaysTransform(PerSegmentWrapper, FutureMixin, DymmyInColumnMixin): +class SpecialDaysTransform(PerSegmentWrapper, FutureMixin): """SpecialDaysTransform generates series that indicates is weekday/monthday is special in given dataframe. Creates columns 'anomaly_weekdays' and 'anomaly_monthdays'. 
diff --git a/etna/transforms/timestamp/time_flags.py b/etna/transforms/timestamp/time_flags.py index c292c16f1..b2cae7f55 100644 --- a/etna/transforms/timestamp/time_flags.py +++ b/etna/transforms/timestamp/time_flags.py @@ -4,12 +4,11 @@ import numpy as np import pandas as pd -from etna.transforms.base import DymmyInColumnMixin from etna.transforms.base import FutureMixin from etna.transforms.base import Transform -class TimeFlagsTransform(Transform, FutureMixin, DymmyInColumnMixin): +class TimeFlagsTransform(Transform, FutureMixin): """TimeFlagsTransform is a class that implements extraction of the main time-based features from datetime column.""" def __init__( From 32a3ac8f6a3b299f8df952e465cf08ce85581063 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 5 Aug 2022 13:25:11 +0300 Subject: [PATCH 05/23] Fix required_features property --- etna/transforms/base.py | 13 +++++++++---- etna/transforms/feature_selection/base.py | 1 - etna/transforms/feature_selection/filter.py | 1 - etna/transforms/nn/pytorch_forecasting.py | 1 - 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index 27ecbb3ef..b5185e894 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -19,7 +19,7 @@ class NewTransform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" def __init__(self): - in_column: Union[Literal["all"], List[str], str] = "target" + self.in_column: Union[Literal["all"], List[str], str] = "target" def get_regressors_info(self) -> List[str]: """Return the list with regressors created by the transform. @@ -35,9 +35,14 @@ def get_regressors_info(self) -> List[str]: def required_features(self) -> Union[Literal["all"], List[str]]: """Get the list of required features.""" required_features = self.in_column - if isinstance(required_features, str) and required_features != "all": - required_features = [required_features] - return required_features + if isinstance(required_features, list): + return required_features + elif isinstance(required_features, str): + if required_features == "all": + return "all" + return [required_features] + else: + raise ValueError("in_column attribute is in incorrect format!") def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.DataFrame) -> TSDataset: """Update TSDataset based on the difference between dfs.""" diff --git a/etna/transforms/feature_selection/base.py b/etna/transforms/feature_selection/base.py index fb628d526..bcbad776d 100644 --- a/etna/transforms/feature_selection/base.py +++ b/etna/transforms/feature_selection/base.py @@ -2,7 +2,6 @@ from abc import ABC from typing import List from typing import Optional -from typing import Sequence from typing import Union import pandas as pd diff --git a/etna/transforms/feature_selection/filter.py b/etna/transforms/feature_selection/filter.py index e13f7e3f0..4103b7b4d 100644 --- a/etna/transforms/feature_selection/filter.py +++ b/etna/transforms/feature_selection/filter.py @@ -9,7 +9,6 @@ class FilterFeaturesTransform(Transform): """Filters features in each segment of the dataframe.""" - def __init__( self, include: Optional[Sequence[str]] = None, diff --git a/etna/transforms/nn/pytorch_forecasting.py b/etna/transforms/nn/pytorch_forecasting.py index 9ae1ee3e3..f0f504ca2 100644 --- a/etna/transforms/nn/pytorch_forecasting.py +++ b/etna/transforms/nn/pytorch_forecasting.py @@ -35,7 +35,6 @@ class PytorchForecastingTransform(Transform): This transform should be added at the very end of 
``transforms`` parameter. """ - def __init__( self, max_encoder_length: int = 30, From 8c583f655aa6d7c1968a7d23da4ec3b4e8953221 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 5 Aug 2022 14:48:48 +0300 Subject: [PATCH 06/23] Add tests --- etna/transforms/__init__.py | 1 + etna/transforms/base.py | 8 +- tests/test_transforms/test_base/test_base.py | 95 ++++++++++++++++++++ 3 files changed, 100 insertions(+), 4 deletions(-) create mode 100644 tests/test_transforms/test_base/test_base.py diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py index 73b78efb9..d4764d5fb 100644 --- a/etna/transforms/__init__.py +++ b/etna/transforms/__init__.py @@ -1,3 +1,4 @@ +from etna.transforms.base import NewTransform from etna.transforms.base import PerSegmentWrapper from etna.transforms.base import Transform from etna.transforms.decomposition import BinsegTrendTransform diff --git a/etna/transforms/base.py b/etna/transforms/base.py index b5185e894..0a7617348 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -18,8 +18,8 @@ class FutureMixin: class NewTransform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" - def __init__(self): - self.in_column: Union[Literal["all"], List[str], str] = "target" + def __init__(self, in_column: Union[Literal["all"], List[str], str] = "target"): + self.in_column = in_column def get_regressors_info(self) -> List[str]: """Return the list with regressors created by the transform. @@ -46,8 +46,8 @@ def required_features(self) -> Union[Literal["all"], List[str]]: def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.DataFrame) -> TSDataset: """Update TSDataset based on the difference between dfs.""" - columns_before = set(df.columns.get_level_values("features")) - columns_after = set(df_transformed.columns.get_level_values("features")) + columns_before = set(df.columns.get_level_values("feature")) + columns_after = set(df_transformed.columns.get_level_values("feature")) # Transforms now can only remove or only add/update columns removed_features = list(columns_before - columns_after) diff --git a/tests/test_transforms/test_base/test_base.py b/tests/test_transforms/test_base/test_base.py new file mode 100644 index 000000000..02b1ea831 --- /dev/null +++ b/tests/test_transforms/test_base/test_base.py @@ -0,0 +1,95 @@ +from unittest.mock import Mock + +import pandas as pd +import pytest + +from etna.datasets import TSDataset +from etna.datasets import generate_ar_df +from etna.transforms import NewTransform + + +class NewTransformMock(NewTransform): + def _fit(self, df: pd.DataFrame) -> "NewTransform": + return self + + def _transform(self, df: pd.DataFrame) -> pd.DataFrame: + return df + + +@pytest.fixture +def remove_columns_df(): + df = generate_ar_df(periods=10, n_segments=3, start_time="2000-01-01") + df["exog_1"] = 1 + df = TSDataset.to_dataset(df) + + df_transformed = generate_ar_df(periods=10, n_segments=3, start_time="2000-01-01") + df_transformed = TSDataset.to_dataset(df_transformed) + return df, df_transformed + + +@pytest.mark.parametrize( + "in_column, expected_features", + [("all", "all"), ("target", ["target"]), (["target", "segment"], ["target", "segment"])], +) +def test_required_features(in_column, expected_features): + transform = NewTransformMock(in_column=in_column) + assert transform.required_features == expected_features + + +def test_update_dataset_remove_columns(remove_columns_df, expected_features_to_remove=["exog_1"]): + ts = Mock() + df, df_transformed = 
remove_columns_df + transform = NewTransformMock() + + transform._update_dataset(ts=ts, df=df, df_transformed=df_transformed) + ts.remove_features.assert_called_with(features=expected_features_to_remove) + + +def test_update_dataset_update_columns(remove_columns_df): + ts = Mock() + df_transformed, df = remove_columns_df + transform = NewTransformMock() + + transform._update_dataset(ts=ts, df=df, df_transformed=df_transformed) + ts.update_columns_from_pandas.assert_called_with(df=df_transformed, regressors=[]) + + +@pytest.mark.parametrize( + "in_column, required_features", + [("all", "all"), ("target", ["target"]), (["target", "segment"], ["target", "segment"])], +) +def test_fit_request_correct_columns(in_column, required_features): + ts = Mock() + transform = NewTransformMock(in_column=in_column) + + transform.fit(ts=ts) + ts.to_pandas.assert_called_with(flatten=False, features=required_features) + + +@pytest.mark.parametrize( + "in_column, required_features", + [("all", "all"), ("target", ["target"]), (["target", "segment"], ["target", "segment"])], +) +def test_transform_request_correct_columns(in_column, required_features): + ts = Mock() + transform = NewTransformMock(in_column=in_column) + transform._update_dataset = Mock() + + transform.transform(ts=ts) + ts.to_pandas.assert_called_with(flatten=False, features=required_features) + + +@pytest.mark.parametrize( + "in_column, required_features", + [("all", "all"), ("target", ["target"]), (["target", "segment"], ["target", "segment"])], +) +def test_transform_request_update_dataset(remove_columns_df, in_column, required_features): + df, _ = remove_columns_df + ts = TSDataset(df=df, freq="D") + ts.to_pandas = Mock(return_value=df) + + transform = NewTransformMock(in_column=in_column) + transform._update_dataset = Mock() + + transform.transform(ts=ts) + transform._update_dataset.assert_called_with(ts=ts, df=df, df_transformed=df) From 7ef3e2187a9e24ddcafa176325123aae35a0af62 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 5 Aug 2022 14:51:16 +0300 Subject: [PATCH 07/23] Update Changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b4615115..858f4e73e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - - -- +- Base class for transforms now works with TSDataset ([835](https://github.com/tinkoff-ai/etna/pull/835)) - All the transforms now has `in_column` attribute ([#820](https://github.com/tinkoff-ai/etna/pull/820)) - - From 04fd8103ac7bebacabfabeff9eb7cfc4b6fa1d0f Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 10 Aug 2022 12:20:23 +0300 Subject: [PATCH 08/23] Add drop_exog flag --- etna/datasets/tsdataset.py | 18 +++++++++++------- etna/models/nn/deepstate/deepstate.py | 0 2 files changed, 11 insertions(+), 7 deletions(-) create mode 100644 etna/models/nn/deepstate/deepstate.py diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index ee9cba93b..38193f765 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -900,19 +900,23 @@ def update_columns_from_pandas(self, df: pd.DataFrame, regressors: Optional[List if regressors is not None: self._regressors = list(set(self._regressors) | set(regressors)) - def remove_features(self, features: List[str]): - """Remove columns from the dataset. 
+ def drop_features(self, features: List[str], drop_from_exog: bool = False): + """Drop columns with features from the dataset. - Columns that are not presented in the dataset will be ignored + Columns that are not presented in the dataset will be ignored. Parameters ---------- features: - List of features to be removed + List of features to drop. + drop_from_exog: + If False, drop features only from df. Features will appear again in df after make_future. + If True, drop features df and df_exog. Features will not appear in df after make_future. """ - columns_in_df = self.df.columns.get_level_values("feature") - columns_to_remove = list(set(columns_in_df) & set(features)) - self.df.drop(columns=columns_to_remove, level="feature", inplace=True) + for df in [self.df, self.df_exog, self.raw_df]: + columns_in_df = df.columns.get_level_values("feature") + columns_to_remove = list(set(columns_in_df) & set(features)) + df.drop(columns=columns_to_remove, level="feature", inplace=True) self._regressors = list(set(self._regressors) - set(features)) @property diff --git a/etna/models/nn/deepstate/deepstate.py b/etna/models/nn/deepstate/deepstate.py new file mode 100644 index 000000000..e69de29bb From 549efca22fa8f5b5153fc080349987d4b905cae0 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 12 Aug 2022 11:44:56 +0300 Subject: [PATCH 09/23] Fix methods in tsdataset --- etna/datasets/tsdataset.py | 58 +++++++++----- tests/test_datasets/test_dataset.py | 112 ++++++++++++++++++++-------- 2 files changed, 121 insertions(+), 49 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 38193f765..3b3fa7c21 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -871,31 +871,46 @@ def train_test_split( return train, test - def update_columns_from_pandas(self, df: pd.DataFrame, regressors: Optional[List[str]] = None): + def _update_columns(self, df: pd.DataFrame, df_index: Union[pd.Index, slice], df_update: pd.DataFrame) -> pd.DataFrame: + columns_in_df = df.columns.get_level_values("feature") + new_columns = df_update.columns.get_level_values("feature") + + columns_to_update = list(set(columns_in_df) & set(new_columns)) + if len(columns_to_update) != 0: + df.loc[df_index, self.idx[self.segments, columns_to_update]] = df_update.loc[ + df_index, self.idx[self.segments, columns_to_update] + ] + + columns_to_add = list(set(new_columns) - set(columns_in_df)) + if len(columns_to_add) != 0: + df = pd.concat((df, df_update.loc[df_index, self.idx[:, columns_to_add]]), axis=1).sort_index(axis=1) + return df + + def update_columns_from_pandas( + self, df_update: pd.DataFrame, update_exog=False, regressors: Optional[List[str]] = None + ): """Update the dataset with the new columns from pandas dataframe. - It is recommended to add the exogenous regressor using the constructor of the TSDataset. - This method does not add the regressors as exogenous data in df_exog, but only update the df attribute. + Before updating columns in df, columns of df_update will be cropped by the last timestamp in df. + Columns in df_exog are updated without cropping. Parameters ---------- - df: + df_update: Dataframe with the new columns in wide ETNA format. If columns with the same names already exist in the dataset, then values will be updated. + update_exog: + If True, update columns also in df_exog. + If columns that you wish to update present both in df and df_exog, it is recommended to turn on this flag. 
regressors: List of regressors in the passed dataframe """ - columns_in_dataset = self.columns.get_level_values("feature") - new_columns = df.columns.get_level_values("feature") - - columns_to_update = list(set(columns_in_dataset) & set(new_columns)) - self.df.loc[:, self.idx[self.segments, columns_to_update]] = df.loc[ - :, self.idx[self.segments, columns_to_update] - ] - - columns_to_add = list(set(new_columns) - set(columns_in_dataset)) - if len(columns_to_add) != 0: - self.df = pd.concat((self.df, df.loc[:, self.idx[:, columns_to_add]]), axis=1).sort_index(axis=1) + self.df = self._update_columns(df=self.df, df_index=self.index, df_update=df_update) + if update_exog: + if self.df_exog is None: + self.df_exog = df_update + else: + self.df_exog = self._update_columns(df=self.df_exog, df_index=slice(None), df_update=df_update) if regressors is not None: self._regressors = list(set(self._regressors) | set(regressors)) @@ -903,19 +918,24 @@ def update_columns_from_pandas(self, df: pd.DataFrame, regressors: Optional[List def drop_features(self, features: List[str], drop_from_exog: bool = False): """Drop columns with features from the dataset. - Columns that are not presented in the dataset will be ignored. - Parameters ---------- features: List of features to drop. drop_from_exog: If False, drop features only from df. Features will appear again in df after make_future. - If True, drop features df and df_exog. Features will not appear in df after make_future. + If True, drop features from df and df_exog. Features won't appear in df after make_future. """ - for df in [self.df, self.df_exog, self.raw_df]: + dfs = [("df", self.df)] + if drop_from_exog: + dfs.append(("df_exog", self.df_exog)) + + for name, df in dfs: columns_in_df = df.columns.get_level_values("feature") columns_to_remove = list(set(columns_in_df) & set(features)) + unknown_columns = set(features) - set(columns_to_remove) + if len(unknown_columns) != 0: + warnings.warn(f"Features {unknown_columns} are not present in {name}!") df.drop(columns=columns_to_remove, level="feature", inplace=True) self._regressors = list(set(self._regressors) - set(features)) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 29a4c1e30..140f8c455 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -55,6 +55,38 @@ def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]: return df, df_exog, ["regressor_1", "regressor_2"] +@pytest.fixture() +def df_update() -> pd.DataFrame: + timestamp = pd.date_range("2021-01-01", "2021-02-12") + df_1 = pd.DataFrame({"timestamp": timestamp, "new_column": 100, "regressor_1": 10, "segment": "1"}) + df_2 = pd.DataFrame({"timestamp": timestamp[5:], "new_column": 200, "regressor_1": 20, "segment": "2"}) + df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(df) + return df + +@pytest.fixture() +def dfs_updated(df_and_regressors, df_update) -> Tuple[pd.DataFrame,pd.DataFrame]: + df, df_exog, _ = df_and_regressors + + df = TSDataset.to_flatten(df) + df["new_column"] = -1 + df.loc[df["segment"] == "1", "new_column"] = 100 + df.loc[df["segment"] == "1", "regressor_1"] = 10 + df.loc[df["segment"] == "2", "new_column"] = 200 + df.loc[df["segment"] == "2", "regressor_1"] = 20 + df = TSDataset.to_dataset(df) + + df_exog = df_exog.reindex(df_update.index) + df_exog = TSDataset.to_flatten(df_exog) + df_exog["new_column"] = -1 + df_exog.loc[df_exog["segment"] == "1", "new_column"] = 100 + 
df_exog.loc[df_exog["segment"] == "1", "regressor_1"] = 10 + df_exog.loc[df_exog["segment"] == "2", "new_column"] = 200 + df_exog.loc[df_exog["segment"] == "2", "regressor_1"] = 20 + df_exog = TSDataset.to_dataset(df_exog) + + return df, df_exog + @pytest.fixture() def df_and_regressors_flat() -> Tuple[pd.DataFrame, pd.DataFrame]: @@ -117,6 +149,7 @@ def df_segments_int(): return df +''' def test_check_endings_error(): """Check that _check_endings method raises exception if some segments end with nan.""" timestamp = pd.date_range("2021-01-01", "2021-02-01") @@ -917,35 +950,24 @@ def make_samples(df): np.testing.assert_array_equal( torch_dataset[1]["target"], tsdf_with_exog.df.loc[:, pd.IndexSlice["Omsk", "target"]].values ) - +''' @pytest.mark.parametrize( - "expected_columns", - ((["target", "regressor_1", "regressor_2"]),), + "update_exog", (True, False) ) -def test_update_columns_update_existing_columns(df_and_regressors, expected_columns): +def test_update_columns_from_pandas(df_and_regressors, df_update, dfs_updated, update_exog): df, df_exog, known_future = df_and_regressors + df_updated, df_exog_updated = dfs_updated + if not update_exog: + df_exog_updated = df_exog ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future=known_future) - df_exog.loc[:, pd.IndexSlice[:, "regressor_1"]] = 0 - ts.update_columns_from_pandas(df=df_exog, regressors=known_future) + ts.update_columns_from_pandas(df_update=df_update, update_exog=update_exog, regressors=known_future) - got_columns = set(ts.columns.get_level_values("feature")) - assert sorted(got_columns) == sorted(expected_columns) - assert (ts[:, :, "regressor_1"] == 0).all().all() - - -@pytest.mark.parametrize( - "expected_columns", - ((["target", "regressor_1", "regressor_2"]),), -) -def test_update_columns_add_columns(df_and_regressors, expected_columns): - df, df_exog, _ = df_and_regressors - ts = TSDataset(df=df, freq="D") - ts.update_columns_from_pandas(df=df_exog, regressors=[]) - got_columns = set(ts.columns.get_level_values("feature")) - assert sorted(got_columns) == sorted(expected_columns) + pd.testing.assert_frame_equal(ts.df, df_updated) + pd.testing.assert_frame_equal(ts.df_exog, df_exog_updated) +''' @pytest.mark.parametrize( "known_future, regressors, expected_regressors", @@ -961,19 +983,47 @@ def test_update_columns_update_regressors(df_and_regressors, known_future, regre assert sorted(ts.regressors) == sorted(expected_regressors) + + @pytest.mark.parametrize( - "features, expected_columns", + "features, drop_from_exog, df_expected_columns, df_exog_expected_columns", ( - (["regressor_2"], ["timestamp", "segment", "target", "regressor_1"]), - (["regressor_2", "out_of_dataset_column"], ["timestamp", "segment", "target", "regressor_1"]), + ( + ["regressor_2"], + False, + ["timestamp", "segment", "target", "regressor_1"], + ["timestamp", "segment", "regressor_1", "regressor_2"], + ), + ( + ["regressor_2"], + True, + ["timestamp", "segment", "target", "regressor_1"], + ["timestamp", "segment", "regressor_1"], + ), + ( + ["regressor_2", "out_of_dataset_column"], + True, + ["timestamp", "segment", "target", "regressor_1"], + ["timestamp", "segment", "regressor_1"], + ), ), ) -def test_remove_features(df_and_regressors, features, expected_columns): +def test_drop_features(df_and_regressors, features, drop_from_exog, df_expected_columns, df_exog_expected_columns): df, df_exog, known_future = df_and_regressors ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) - ts.remove_features(features=features) - 
got_columns = ts.to_flatten(ts.df).columns - assert sorted(got_columns) == sorted(expected_columns) + ts.drop_features(features=features, drop_from_exog=drop_from_exog) + df_columns, df_exog_columns = ts.to_flatten(ts.df).columns, ts.to_flatten(ts.df_exog).columns + assert sorted(df_columns) == sorted(df_expected_columns) + assert sorted(df_exog_columns) == sorted(df_exog_expected_columns) + + +def test_drop_features_raise_warning_on_unknown_columns( + df_and_regressors, features=["regressor_2", "out_of_dataset_column"] +): + df, df_exog, known_future = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) + with pytest.warns(UserWarning, match="Features {'out_of_dataset_column'} are not present in df!"): + ts.drop_features(features=features, drop_from_exog=False) @pytest.mark.parametrize( @@ -983,8 +1033,10 @@ def test_remove_features(df_and_regressors, features, expected_columns): (["out_of_dataset_column"], ["regressor_1", "regressor_2"]), ), ) -def test_remove_features_update_regressors(df_and_regressors, features, expected_regressors): +def test_drop_features_update_regressors(df_and_regressors, features, expected_regressors): df, df_exog, known_future = df_and_regressors ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) - ts.remove_features(features=features) + ts.drop_features(features=features, drop_from_exog=False) assert sorted(ts.regressors) == sorted(expected_regressors) + +''' From ef5df42fcdcc569e86f34d332a2a1c23cbc7a620 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 15 Aug 2022 09:53:55 +0300 Subject: [PATCH 10/23] Add common logic to inverse_transform --- etna/transforms/base.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index 0a7617348..3f241a4ee 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -52,14 +52,14 @@ def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.Da # Transforms now can only remove or only add/update columns removed_features = list(columns_before - columns_after) if len(removed_features) != 0: - ts.remove_features(features=removed_features) + ts.drop_features(features=removed_features, drop_from_exog=False) else: new_regressors = self.get_regressors_info() - ts.update_columns_from_pandas(df=df_transformed, regressors=new_regressors) + ts.update_columns_from_pandas(df_update=df_transformed, update_exog=False, regressors=new_regressors) return ts @abstractmethod - def _fit(self, df: pd.DataFrame) -> "NewTransform": + def _fit(self, df: pd.DataFrame): """Fit the transform. Should be implemented by user. @@ -68,11 +68,6 @@ def _fit(self, df: pd.DataFrame) -> "NewTransform": ---------- df: Dataframe in etna wide format. - - Returns - ------- - : - The fitted transform instance. """ pass @@ -176,6 +171,10 @@ def inverse_transform(self, ts: TSDataset) -> TSDataset: : TSDataset after applying inverse transformation. 
""" + df = ts.to_pandas(flatten=False, features="all") + df_transformed = self._inverse_transform(df=df) + if not df.equals(df_transformed): + ts = self._update_dataset(ts=ts, df=df, df_transformed=df_transformed) return ts From b49185a6cfe54f71be8078824cf6316438d49703 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 15 Aug 2022 09:54:20 +0300 Subject: [PATCH 11/23] Add tests for base transform --- tests/test_transforms/test_base/test_base.py | 47 +++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/tests/test_transforms/test_base/test_base.py b/tests/test_transforms/test_base/test_base.py index 02b1ea831..a96acab91 100644 --- a/tests/test_transforms/test_base/test_base.py +++ b/tests/test_transforms/test_base/test_base.py @@ -1,3 +1,4 @@ +from typing import List from unittest.mock import Mock import pandas as pd @@ -9,12 +10,20 @@ class NewTransformMock(NewTransform): - def _fit(self, df: pd.DataFrame) -> "NewTransform": - return self + def get_regressors_info(self) -> List[str]: + return ["regressor_test"] + + def _fit(self, df: pd.DataFrame): + pass def _transform(self, df: pd.DataFrame) -> pd.DataFrame: return df + def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: + df = df.copy() + df["target"] = -100 + return df + @pytest.fixture def remove_columns_df(): @@ -36,13 +45,16 @@ def test_required_features(in_column, expected_features): assert transform.required_features == expected_features -def test_update_dataset_remove_columns(remove_columns_df, expected_features_to_remove=["exog_1"]): +def test_update_dataset_remove_columns(remove_columns_df): ts = Mock() df, df_transformed = remove_columns_df + expected_features_to_remove = list( + set(df.columns.get_level_values("feature")) - set(df_transformed.columns.get_level_values("feature")) + ) transform = NewTransformMock() transform._update_dataset(ts=ts, df=df, df_transformed=df_transformed) - ts.remove_features.assert_called_with(features=expected_features_to_remove) + ts.drop_features.assert_called_with(features=expected_features_to_remove, drop_from_exog=False) def test_update_dataset_update_columns(remove_columns_df): @@ -51,7 +63,9 @@ def test_update_dataset_update_columns(remove_columns_df): transform = NewTransformMock() transform._update_dataset(ts=ts, df=df, df_transformed=df_transformed) - ts.update_columns_from_pandas.assert_called_with(df=df_transformed, regressors=[]) + ts.update_columns_from_pandas.assert_called_with( + df_update=df_transformed, update_exog=False, regressors=transform.get_regressors_info() + ) @pytest.mark.parametrize( @@ -93,3 +107,26 @@ def test_transform_request_update_dataset(remove_columns_df, in_column, required transform.transform(ts=ts) transform._update_dataset.assert_called_with(ts=ts, df=df, df_transformed=df) + + +def test_inverse_transform_update_dataset(remove_columns_df): + df, _ = remove_columns_df + ts = TSDataset(df=df, freq="D") + + transform = NewTransformMock() + transform._update_dataset = Mock() + + transform.inverse_transform(ts=ts) + transform._update_dataset.assert_called() + + +def test_inverse_transform_not_update_dataset_if_not_transformed(remove_columns_df): + df, _ = remove_columns_df + ts = TSDataset(df=df, freq="D") + + transform = NewTransformMock() + transform._update_dataset = Mock() + transform._inverse_transform = Mock(return_value=df) + + transform.inverse_transform(ts=ts) + assert not transform._update_dataset.called From efa244751f6e57cc9ff2b140b71b6d9f1eba3e46 Mon Sep 17 00:00:00 2001 From: alex-hse-repository 
Date: Mon, 15 Aug 2022 10:21:21 +0300 Subject: [PATCH 12/23] Update dataset methods --- etna/datasets/tsdataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 3b3fa7c21..18414564d 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -871,7 +871,9 @@ def train_test_split( return train, test - def _update_columns(self, df: pd.DataFrame, df_index: Union[pd.Index, slice], df_update: pd.DataFrame) -> pd.DataFrame: + def _update_columns( + self, df: pd.DataFrame, df_index: Union[pd.Index, slice], df_update: pd.DataFrame + ) -> pd.DataFrame: columns_in_df = df.columns.get_level_values("feature") new_columns = df_update.columns.get_level_values("feature") From 8070a75e1ec7f0c182b4254a3a53592e42e2d65a Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 15 Aug 2022 10:53:25 +0300 Subject: [PATCH 13/23] Comment unfinished tests --- tests/test_datasets/test_dataset.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 140f8c455..126ed8a44 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -55,6 +55,7 @@ def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]: return df, df_exog, ["regressor_1", "regressor_2"] + @pytest.fixture() def df_update() -> pd.DataFrame: timestamp = pd.date_range("2021-01-01", "2021-02-12") @@ -64,21 +65,22 @@ def df_update() -> pd.DataFrame: df = TSDataset.to_dataset(df) return df + @pytest.fixture() -def dfs_updated(df_and_regressors, df_update) -> Tuple[pd.DataFrame,pd.DataFrame]: +def df_and_regressors_updated(df_and_regressors, df_update) -> Tuple[pd.DataFrame, pd.DataFrame]: df, df_exog, _ = df_and_regressors df = TSDataset.to_flatten(df) - df["new_column"] = -1 df.loc[df["segment"] == "1", "new_column"] = 100 df.loc[df["segment"] == "1", "regressor_1"] = 10 + df.loc[df["segment"] == "1", "regressor_2"] = 2 df.loc[df["segment"] == "2", "new_column"] = 200 df.loc[df["segment"] == "2", "regressor_1"] = 20 + df.loc[df["segment"] == "2", "regressor_2"] = 4 df = TSDataset.to_dataset(df) df_exog = df_exog.reindex(df_update.index) df_exog = TSDataset.to_flatten(df_exog) - df_exog["new_column"] = -1 df_exog.loc[df_exog["segment"] == "1", "new_column"] = 100 df_exog.loc[df_exog["segment"] == "1", "regressor_1"] = 10 df_exog.loc[df_exog["segment"] == "2", "new_column"] = 200 @@ -149,7 +151,6 @@ def df_segments_int(): return df -''' def test_check_endings_error(): """Check that _check_endings method raises exception if some segments end with nan.""" timestamp = pd.date_range("2021-01-01", "2021-02-01") @@ -950,14 +951,13 @@ def make_samples(df): np.testing.assert_array_equal( torch_dataset[1]["target"], tsdf_with_exog.df.loc[:, pd.IndexSlice["Omsk", "target"]].values ) -''' -@pytest.mark.parametrize( - "update_exog", (True, False) -) -def test_update_columns_from_pandas(df_and_regressors, df_update, dfs_updated, update_exog): + +""" +@pytest.mark.parametrize("update_exog", (True, False)) +def test_update_columns_from_pandas(df_and_regressors, df_update, df_and_regressors_updated, update_exog): df, df_exog, known_future = df_and_regressors - df_updated, df_exog_updated = dfs_updated + df_updated, df_exog_updated = df_and_regressors_updated if not update_exog: df_exog_updated = df_exog ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future=known_future) @@ -967,7 +967,6 @@ def 
test_update_columns_from_pandas(df_and_regressors, df_update, dfs_updated, u pd.testing.assert_frame_equal(ts.df, df_updated) pd.testing.assert_frame_equal(ts.df_exog, df_exog_updated) -''' @pytest.mark.parametrize( "known_future, regressors, expected_regressors", @@ -981,8 +980,7 @@ def test_update_columns_update_regressors(df_and_regressors, known_future, regre ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future=known_future) ts.update_columns_from_pandas(df=df_exog, regressors=regressors) assert sorted(ts.regressors) == sorted(expected_regressors) - - +""" @pytest.mark.parametrize( @@ -1038,5 +1036,3 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_r ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) ts.drop_features(features=features, drop_from_exog=False) assert sorted(ts.regressors) == sorted(expected_regressors) - -''' From 8ec19da3afd3116b0c9ebae16f230ec7b5bbc79a Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 16 Aug 2022 09:14:13 +0300 Subject: [PATCH 14/23] Add override decorator --- etna/transforms/base.py | 12 +++++++++--- tests/test_transforms/test_base/test_base.py | 6 ++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index 3f241a4ee..da94d74de 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -15,6 +15,12 @@ class FutureMixin: """Mixin for transforms that can convert non-regressor column to a regressor one.""" +def override(method): + """Indicate the fact of overriding the method.""" + method.is_overridden = True + return method + + class NewTransform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" @@ -171,9 +177,9 @@ def inverse_transform(self, ts: TSDataset) -> TSDataset: : TSDataset after applying inverse transformation. 
""" - df = ts.to_pandas(flatten=False, features="all") - df_transformed = self._inverse_transform(df=df) - if not df.equals(df_transformed): + if hasattr(self._inverse_transform, "is_overridden"): + df = ts.to_pandas(flatten=False, features="all") + df_transformed = self._inverse_transform(df=df) ts = self._update_dataset(ts=ts, df=df, df_transformed=df_transformed) return ts diff --git a/tests/test_transforms/test_base/test_base.py b/tests/test_transforms/test_base/test_base.py index a96acab91..02993e2ee 100644 --- a/tests/test_transforms/test_base/test_base.py +++ b/tests/test_transforms/test_base/test_base.py @@ -7,6 +7,7 @@ from etna.datasets import TSDataset from etna.datasets import generate_ar_df from etna.transforms import NewTransform +from etna.transforms.base import override class NewTransformMock(NewTransform): @@ -19,6 +20,7 @@ def _fit(self, df: pd.DataFrame): def _transform(self, df: pd.DataFrame) -> pd.DataFrame: return df + @override def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df["target"] = -100 @@ -120,13 +122,13 @@ def test_inverse_transform_update_dataset(remove_columns_df): transform._update_dataset.assert_called() -def test_inverse_transform_not_update_dataset_if_not_transformed(remove_columns_df): +def test_inverse_transform_not_update_dataset_if_not_overriden(remove_columns_df): df, _ = remove_columns_df ts = TSDataset(df=df, freq="D") transform = NewTransformMock() transform._update_dataset = Mock() - transform._inverse_transform = Mock(return_value=df) + transform._inverse_transform = lambda df: df transform.inverse_transform(ts=ts) assert not transform._update_dataset.called From f4c91bde66acecc65f70d70942e70c137922bd4b Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 16 Aug 2022 09:15:39 +0300 Subject: [PATCH 15/23] Remove unnesesary file --- etna/models/nn/deepstate/deepstate.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 etna/models/nn/deepstate/deepstate.py diff --git a/etna/models/nn/deepstate/deepstate.py b/etna/models/nn/deepstate/deepstate.py deleted file mode 100644 index e69de29bb..000000000 From 2b5f276b5fed15570352be6c3437ba0f2010c8a1 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 17 Aug 2022 10:29:41 +0300 Subject: [PATCH 16/23] Separate method for updating/adding columns --- etna/datasets/tsdataset.py | 49 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 18414564d..e4608eaaf 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -871,49 +871,46 @@ def train_test_split( return train, test - def _update_columns( - self, df: pd.DataFrame, df_index: Union[pd.Index, slice], df_update: pd.DataFrame - ) -> pd.DataFrame: - columns_in_df = df.columns.get_level_values("feature") - new_columns = df_update.columns.get_level_values("feature") - - columns_to_update = list(set(columns_in_df) & set(new_columns)) - if len(columns_to_update) != 0: - df.loc[df_index, self.idx[self.segments, columns_to_update]] = df_update.loc[ - df_index, self.idx[self.segments, columns_to_update] - ] - - columns_to_add = list(set(new_columns) - set(columns_in_df)) - if len(columns_to_add) != 0: - df = pd.concat((df, df_update.loc[df_index, self.idx[:, columns_to_add]]), axis=1).sort_index(axis=1) - return df + def update_columns_from_pandas(self, df_update: pd.DataFrame): + """Update the existing columns in the dataset with the new values from pandas dataframe. 
+ + Before updating columns in df, columns of df_update will be cropped by the last timestamp in df. + If columns in df_exog are not updated. If you wish to update the df_exog, create the new + instance of TSDataset. + + Parameters + ---------- + df_update: + Dataframe with new values in wide ETNA format. + """ + columns_to_update = sorted(set(df_update.columns.get_level_values("feature"))) + self.df.loc[:, self.idx[self.segments, columns_to_update]] = df_update.loc[ + : self.df.index.max(), self.idx[self.segments, columns_to_update] + ] - def update_columns_from_pandas( - self, df_update: pd.DataFrame, update_exog=False, regressors: Optional[List[str]] = None + def add_columns_from_pandas( + self, df_update: pd.DataFrame, update_exog: bool = False, regressors: Optional[List[str]] = None ): """Update the dataset with the new columns from pandas dataframe. Before updating columns in df, columns of df_update will be cropped by the last timestamp in df. - Columns in df_exog are updated without cropping. Parameters ---------- df_update: Dataframe with the new columns in wide ETNA format. - If columns with the same names already exist in the dataset, then values will be updated. update_exog: - If True, update columns also in df_exog. - If columns that you wish to update present both in df and df_exog, it is recommended to turn on this flag. + If True, update columns also in df_exog. + If you wish to add new regressors in the dataset it is recommended to turn on this flag. regressors: - List of regressors in the passed dataframe + List of regressors in the passed dataframe. """ - self.df = self._update_columns(df=self.df, df_index=self.index, df_update=df_update) + self.df = pd.concat((self.df, df_update[: self.df.index.max()]), axis=1).sort_index(axis=1) if update_exog: if self.df_exog is None: self.df_exog = df_update else: - self.df_exog = self._update_columns(df=self.df_exog, df_index=slice(None), df_update=df_update) - + self.df_exog = pd.concat((self.df_exog, df_update), axis=1).sort_index(axis=1) if regressors is not None: self._regressors = list(set(self._regressors) | set(regressors)) From c32cbce45fe5918decda6b3b1a5ff0aea79247a1 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 17 Aug 2022 10:30:15 +0300 Subject: [PATCH 17/23] Update tests for dataset --- tests/test_datasets/test_dataset.py | 93 +++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 126ed8a44..b7c14735d 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -57,15 +57,66 @@ def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]: @pytest.fixture() -def df_update() -> pd.DataFrame: +def df_update_add_column() -> pd.DataFrame: timestamp = pd.date_range("2021-01-01", "2021-02-12") - df_1 = pd.DataFrame({"timestamp": timestamp, "new_column": 100, "regressor_1": 10, "segment": "1"}) - df_2 = pd.DataFrame({"timestamp": timestamp[5:], "new_column": 200, "regressor_1": 20, "segment": "2"}) + df_1 = pd.DataFrame({"timestamp": timestamp, "new_column": 100, "segment": "1"}) + df_2 = pd.DataFrame({"timestamp": timestamp, "new_column": 200, "segment": "2"}) df = pd.concat([df_1, df_2], ignore_index=True) df = TSDataset.to_dataset(df) return df +@pytest.fixture() +def df_update_update_column() -> pd.DataFrame: + timestamp = pd.date_range("2021-01-01", "2021-02-12") + df_1 = pd.DataFrame({"timestamp": timestamp, "target": 100, "segment": "1"}) + df_2 = 
pd.DataFrame({"timestamp": timestamp, "target": 200, "segment": "2"}) + df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(df) + return df + + +@pytest.fixture() +def df_updated_add_column() -> pd.DataFrame: + timestamp = pd.date_range("2021-01-01", "2021-02-01") + df_1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "new_column": 100, "segment": "1"}) + df_2 = pd.DataFrame({"timestamp": timestamp, "target": 12, "new_column": 200, "segment": "2"}) + df_2.loc[:4, "target"] = None + df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset(df=TSDataset.to_dataset(df), freq="D").df + return df + + +@pytest.fixture() +def df_updated_update_column() -> pd.DataFrame: + timestamp = pd.date_range("2021-01-01", "2021-02-01") + df_1 = pd.DataFrame({"timestamp": timestamp, "target": 100, "segment": "1"}) + df_2 = pd.DataFrame({"timestamp": timestamp, "target": 200, "segment": "2"}) + df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset(df=TSDataset.to_dataset(df), freq="D").df + return df + + +@pytest.fixture() +def df_exog_updated_add_column() -> pd.DataFrame: + timestamp = pd.date_range("2020-12-01", "2021-02-12") + df_1 = pd.DataFrame({"timestamp": timestamp, "regressor_1": 1, "regressor_2": 2, "new_column": 100, "segment": "1"}) + df_1.iloc[-1:, df_1.columns.get_loc("regressor_1")] = None + df_1.iloc[-1:, df_1.columns.get_loc("regressor_2")] = None + df_1.iloc[:31, df_1.columns.get_loc("new_column")] = None + df_2 = pd.DataFrame({"timestamp": timestamp, "regressor_1": 3, "regressor_2": 4, "new_column": 200, "segment": "2"}) + df_2.iloc[:5, df_2.columns.get_loc("regressor_1")] = None + df_2.iloc[:5, df_2.columns.get_loc("regressor_2")] = None + df_2.iloc[-1:, df_2.columns.get_loc("regressor_1")] = None + df_2.iloc[-1:, df_2.columns.get_loc("regressor_2")] = None + df_2.iloc[:31, df_2.columns.get_loc("new_column")] = None + df_exog = pd.concat([df_1, df_2], ignore_index=True) + df_exog = TSDataset.to_dataset(df_exog) + df_exog = TSDataset(df=df_exog, freq="D").df + return df_exog + + +""" @pytest.fixture() def df_and_regressors_updated(df_and_regressors, df_update) -> Tuple[pd.DataFrame, pd.DataFrame]: df, df_exog, _ = df_and_regressors @@ -88,6 +139,7 @@ def df_and_regressors_updated(df_and_regressors, df_update) -> Tuple[pd.DataFram df_exog = TSDataset.to_dataset(df_exog) return df, df_exog +""" @pytest.fixture() @@ -953,19 +1005,18 @@ def make_samples(df): ) -""" -@pytest.mark.parametrize("update_exog", (True, False)) -def test_update_columns_from_pandas(df_and_regressors, df_update, df_and_regressors_updated, update_exog): - df, df_exog, known_future = df_and_regressors - df_updated, df_exog_updated = df_and_regressors_updated - if not update_exog: - df_exog_updated = df_exog - ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future=known_future) +def test_add_columns_from_pandas_update_df(df_and_regressors, df_update_add_column, df_updated_add_column): + df, _, _ = df_and_regressors + ts = TSDataset(df=df, freq="D") + ts.add_columns_from_pandas(df_update=df_update_add_column, update_exog=False) + pd.testing.assert_frame_equal(ts.df, df_updated_add_column) - ts.update_columns_from_pandas(df_update=df_update, update_exog=update_exog, regressors=known_future) - pd.testing.assert_frame_equal(ts.df, df_updated) - pd.testing.assert_frame_equal(ts.df_exog, df_exog_updated) +def test_add_columns_from_pandas_update_df_exog(df_and_regressors, df_update_add_column, df_exog_updated_add_column): + df, df_exog, _ = df_and_regressors + ts = 
TSDataset(df=df, freq="D", df_exog=df_exog) + ts.add_columns_from_pandas(df_update=df_update_add_column, update_exog=True) + pd.testing.assert_frame_equal(ts.df_exog, df_exog_updated_add_column, check_dtype=False) @pytest.mark.parametrize( @@ -975,12 +1026,20 @@ def test_update_columns_from_pandas(df_and_regressors, df_update, df_and_regress (["regressor_1"], ["regressor_1", "regressor_2"], ["regressor_1", "regressor_2"]), ), ) -def test_update_columns_update_regressors(df_and_regressors, known_future, regressors, expected_regressors): +def test_add_columns_from_pandas_update_regressors( + df_and_regressors, df_update_add_column, known_future, regressors, expected_regressors +): df, df_exog, _ = df_and_regressors ts = TSDataset(df=df, freq="D", df_exog=df_exog, known_future=known_future) - ts.update_columns_from_pandas(df=df_exog, regressors=regressors) + ts.add_columns_from_pandas(df_update=df_update_add_column, update_exog=True, regressors=regressors) assert sorted(ts.regressors) == sorted(expected_regressors) -""" + + +def test_update_columns_from_pandas(df_and_regressors, df_update_update_column, df_updated_update_column): + df, _, _ = df_and_regressors + ts = TSDataset(df=df, freq="D") + ts.update_columns_from_pandas(df_update=df_update_update_column) + pd.testing.assert_frame_equal(ts.df, df_updated_update_column) @pytest.mark.parametrize( From 41d1982d181db4a37a5006680b4f1b8273a89c81 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 17 Aug 2022 10:30:32 +0300 Subject: [PATCH 18/23] Update base class --- etna/transforms/base.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index da94d74de..cf4294765 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -54,14 +54,23 @@ def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.Da """Update TSDataset based on the difference between dfs.""" columns_before = set(df.columns.get_level_values("feature")) columns_after = set(df_transformed.columns.get_level_values("feature")) + columns_to_update = list(set(columns_before) & set(columns_after)) + columns_to_add = list(set(columns_after) - set(columns_before)) + columns_to_remove = list(set(columns_before) - set(columns_after)) - # Transforms now can only remove or only add/update columns - removed_features = list(columns_before - columns_after) - if len(removed_features) != 0: - ts.drop_features(features=removed_features, drop_from_exog=False) - else: + if len(columns_to_remove) != 0: + ts.drop_features(features=columns_to_remove, drop_from_exog=False) + if len(columns_to_add) != 0: new_regressors = self.get_regressors_info() - ts.update_columns_from_pandas(df_update=df_transformed, update_exog=False, regressors=new_regressors) + ts.add_columns_from_pandas( + df_update=df_transformed.loc[pd.IndexSlice[:], pd.IndexSlice[ts.segments, columns_to_add]], + update_exog=False, + regressors=new_regressors, + ) + if len(columns_to_update) != 0: + ts.add_columns_from_pandas( + df_update=df_transformed.loc[pd.IndexSlice[:], pd.IndexSlice[ts.segments, columns_to_update]] + ) return ts @abstractmethod From 951dfe2f3457db69a35291cb6f31adb7074d1981 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Wed, 17 Aug 2022 10:34:18 +0300 Subject: [PATCH 19/23] Update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 858f4e73e..b6276e88b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,8 
+21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 -
 - Base class for transforms now works with TSDataset ([835](https://github.com/tinkoff-ai/etna/pull/835))
 - All the transforms now has `in_column` attribute ([#820](https://github.com/tinkoff-ai/etna/pull/820))
--
--
+- Rename `remove_columns` to `drop_features`, split `update_columns_from_pandas` into `add_columns_from_pandas` and `update_columns_from_pandas` ([835](https://github.com/tinkoff-ai/etna/pull/835))
+-
 ### Fixed
 -
 -

From 88b027cb8899055b44fef8a17ad59836068997e7 Mon Sep 17 00:00:00 2001
From: alex-hse-repository
Date: Wed, 17 Aug 2022 12:24:19 +0300
Subject: [PATCH 20/23] Fix tests

---
 etna/transforms/base.py                      |  2 +-
 tests/test_transforms/test_base/test_base.py | 20 +++++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/etna/transforms/base.py b/etna/transforms/base.py
index cf4294765..d2307739d 100644
--- a/etna/transforms/base.py
+++ b/etna/transforms/base.py
@@ -68,7 +68,7 @@ def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.Da
                 regressors=new_regressors,
             )
         if len(columns_to_update) != 0:
-            ts.add_columns_from_pandas(
+            ts.update_columns_from_pandas(
                 df_update=df_transformed.loc[pd.IndexSlice[:], pd.IndexSlice[ts.segments, columns_to_update]]
             )
         return ts

diff --git a/tests/test_transforms/test_base/test_base.py b/tests/test_transforms/test_base/test_base.py
index 02993e2ee..ba50bbd60 100644
--- a/tests/test_transforms/test_base/test_base.py
+++ b/tests/test_transforms/test_base/test_base.py
@@ -48,8 +48,9 @@ def test_required_features(in_column, expected_features):
 
 
 def test_update_dataset_remove_columns(remove_columns_df):
-    ts = Mock()
     df, df_transformed = remove_columns_df
+    ts = TSDataset(df=df, freq="D")
+    ts.drop_features = Mock()
     expected_features_to_remove = list(
         set(df.columns.get_level_values("feature")) - set(df_transformed.columns.get_level_values("feature"))
     )
@@ -60,14 +61,23 @@ def test_update_dataset_remove_columns(remove_columns_df):
 
 
 def test_update_dataset_update_columns(remove_columns_df):
-    ts = Mock()
+    df, df_transformed = remove_columns_df
+    ts = TSDataset(df=df, freq="D")
+    ts.update_columns_from_pandas = Mock()
+    transform = NewTransformMock()
+
+    transform._update_dataset(ts=ts, df=df, df_transformed=df_transformed)
+    ts.update_columns_from_pandas.assert_called()
+
+
+def test_update_dataset_add_columns(remove_columns_df):
     df_transformed, df = remove_columns_df
+    ts = TSDataset(df=df, freq="D")
+    ts.add_columns_from_pandas = Mock()
     transform = NewTransformMock()
 
     transform._update_dataset(ts=ts, df=df, df_transformed=df_transformed)
-    ts.update_columns_from_pandas.assert_called_with(
-        df_update=df_transformed, update_exog=False, regressors=transform.get_regressors_info()
-    )
+    ts.add_columns_from_pandas.assert_called()
 
 
 @pytest.mark.parametrize(

From f06f073026a1bf3600866da91e9bad85930247e2 Mon Sep 17 00:00:00 2001
From: alex-hse-repository
Date: Wed, 17 Aug 2022 17:16:41 +0300
Subject: [PATCH 21/23] Fixes

---
 etna/datasets/tsdataset.py          |  4 +--
 tests/test_datasets/test_dataset.py | 42 ++++++-----------------------
 2 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py
index e4608eaaf..eb1583121 100644
--- a/etna/datasets/tsdataset.py
+++ b/etna/datasets/tsdataset.py
@@ -922,8 +922,8 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
         features:
             List of features to drop.
drop_from_exog: - If False, drop features only from df. Features will appear again in df after make_future. - If True, drop features from df and df_exog. Features won't appear in df after make_future. + * If False, drop features only from df. Features will appear again in df after make_future. + * If True, drop features from df and df_exog. Features won't appear in df after make_future. """ dfs = [("df", self.df)] if drop_from_exog: diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index b7c14735d..a1f82718b 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -39,7 +39,7 @@ def tsdf_with_exog(random_seed) -> TSDataset: return ts -@pytest.fixture() +@pytest.fixture def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]: timestamp = pd.date_range("2021-01-01", "2021-02-01") df_1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "segment": "1"}) @@ -56,7 +56,7 @@ def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]: return df, df_exog, ["regressor_1", "regressor_2"] -@pytest.fixture() +@pytest.fixture def df_update_add_column() -> pd.DataFrame: timestamp = pd.date_range("2021-01-01", "2021-02-12") df_1 = pd.DataFrame({"timestamp": timestamp, "new_column": 100, "segment": "1"}) @@ -66,7 +66,7 @@ def df_update_add_column() -> pd.DataFrame: return df -@pytest.fixture() +@pytest.fixture def df_update_update_column() -> pd.DataFrame: timestamp = pd.date_range("2021-01-01", "2021-02-12") df_1 = pd.DataFrame({"timestamp": timestamp, "target": 100, "segment": "1"}) @@ -76,7 +76,7 @@ def df_update_update_column() -> pd.DataFrame: return df -@pytest.fixture() +@pytest.fixture def df_updated_add_column() -> pd.DataFrame: timestamp = pd.date_range("2021-01-01", "2021-02-01") df_1 = pd.DataFrame({"timestamp": timestamp, "target": 11, "new_column": 100, "segment": "1"}) @@ -87,7 +87,7 @@ def df_updated_add_column() -> pd.DataFrame: return df -@pytest.fixture() +@pytest.fixture def df_updated_update_column() -> pd.DataFrame: timestamp = pd.date_range("2021-01-01", "2021-02-01") df_1 = pd.DataFrame({"timestamp": timestamp, "target": 100, "segment": "1"}) @@ -97,7 +97,7 @@ def df_updated_update_column() -> pd.DataFrame: return df -@pytest.fixture() +@pytest.fixture def df_exog_updated_add_column() -> pd.DataFrame: timestamp = pd.date_range("2020-12-01", "2021-02-12") df_1 = pd.DataFrame({"timestamp": timestamp, "regressor_1": 1, "regressor_2": 2, "new_column": 100, "segment": "1"}) @@ -116,33 +116,7 @@ def df_exog_updated_add_column() -> pd.DataFrame: return df_exog -""" -@pytest.fixture() -def df_and_regressors_updated(df_and_regressors, df_update) -> Tuple[pd.DataFrame, pd.DataFrame]: - df, df_exog, _ = df_and_regressors - - df = TSDataset.to_flatten(df) - df.loc[df["segment"] == "1", "new_column"] = 100 - df.loc[df["segment"] == "1", "regressor_1"] = 10 - df.loc[df["segment"] == "1", "regressor_2"] = 2 - df.loc[df["segment"] == "2", "new_column"] = 200 - df.loc[df["segment"] == "2", "regressor_1"] = 20 - df.loc[df["segment"] == "2", "regressor_2"] = 4 - df = TSDataset.to_dataset(df) - - df_exog = df_exog.reindex(df_update.index) - df_exog = TSDataset.to_flatten(df_exog) - df_exog.loc[df_exog["segment"] == "1", "new_column"] = 100 - df_exog.loc[df_exog["segment"] == "1", "regressor_1"] = 10 - df_exog.loc[df_exog["segment"] == "2", "new_column"] = 200 - df_exog.loc[df_exog["segment"] == "2", "regressor_1"] = 20 - df_exog = TSDataset.to_dataset(df_exog) - - return df, df_exog -""" - - 
-@pytest.fixture() +@pytest.fixture def df_and_regressors_flat() -> Tuple[pd.DataFrame, pd.DataFrame]: """Return flat versions of df and df_exog.""" timestamp = pd.date_range("2021-01-01", "2021-02-01") @@ -1016,7 +990,7 @@ def test_add_columns_from_pandas_update_df_exog(df_and_regressors, df_update_add df, df_exog, _ = df_and_regressors ts = TSDataset(df=df, freq="D", df_exog=df_exog) ts.add_columns_from_pandas(df_update=df_update_add_column, update_exog=True) - pd.testing.assert_frame_equal(ts.df_exog, df_exog_updated_add_column, check_dtype=False) + pd.testing.assert_frame_equal(ts.df_exog, df_exog_updated_add_column) @pytest.mark.parametrize( From 857ff040c8517fd524560467414bd61fe1caaccd Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 19 Aug 2022 07:19:26 +0300 Subject: [PATCH 22/23] Remove implementation of inverse_transform from the base class --- etna/transforms/base.py | 27 +----------------- tests/test_transforms/test_base/test_base.py | 30 -------------------- 2 files changed, 1 insertion(+), 56 deletions(-) diff --git a/etna/transforms/base.py b/etna/transforms/base.py index d2307739d..a0ec00c64 100644 --- a/etna/transforms/base.py +++ b/etna/transforms/base.py @@ -15,12 +15,6 @@ class FutureMixin: """Mixin for transforms that can convert non-regressor column to a regressor one.""" -def override(method): - """Indicate the fact of overriding the method.""" - method.is_overridden = True - return method - - class NewTransform(ABC, BaseMixin): """Base class to create any transforms to apply to data.""" @@ -156,25 +150,10 @@ def fit_transform(self, ts: TSDataset) -> TSDataset: """ return self.fit(ts=ts).transform(ts=ts) - def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: - """Inverse transform dataframe. - - Parameters - ---------- - df: - Dataframe in etna wide format. - - Returns - ------- - : - Dataframe in etna wide format after applying inverse transformation. - """ - return df - def inverse_transform(self, ts: TSDataset) -> TSDataset: """Inverse transform TSDataset. - Should be reimplemented in the classes with reimplemented _inverse_transform method. + Should be reimplemented in the subclasses where necessary. Parameters ---------- @@ -186,10 +165,6 @@ def inverse_transform(self, ts: TSDataset) -> TSDataset: : TSDataset after applying inverse transformation. 
""" - if hasattr(self._inverse_transform, "is_overridden"): - df = ts.to_pandas(flatten=False, features="all") - df_transformed = self._inverse_transform(df=df) - ts = self._update_dataset(ts=ts, df=df, df_transformed=df_transformed) return ts diff --git a/tests/test_transforms/test_base/test_base.py b/tests/test_transforms/test_base/test_base.py index ba50bbd60..eb1ac7a3e 100644 --- a/tests/test_transforms/test_base/test_base.py +++ b/tests/test_transforms/test_base/test_base.py @@ -7,7 +7,6 @@ from etna.datasets import TSDataset from etna.datasets import generate_ar_df from etna.transforms import NewTransform -from etna.transforms.base import override class NewTransformMock(NewTransform): @@ -20,12 +19,6 @@ def _fit(self, df: pd.DataFrame): def _transform(self, df: pd.DataFrame) -> pd.DataFrame: return df - @override - def _inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame: - df = df.copy() - df["target"] = -100 - return df - @pytest.fixture def remove_columns_df(): @@ -119,26 +112,3 @@ def test_transform_request_update_dataset(remove_columns_df, in_column, required transform.transform(ts=ts) transform._update_dataset.assert_called_with(ts=ts, df=df, df_transformed=df) - - -def test_inverse_transform_update_dataset(remove_columns_df): - df, _ = remove_columns_df - ts = TSDataset(df=df, freq="D") - - transform = NewTransformMock() - transform._update_dataset = Mock() - - transform.inverse_transform(ts=ts) - transform._update_dataset.assert_called() - - -def test_inverse_transform_not_update_dataset_if_not_overriden(remove_columns_df): - df, _ = remove_columns_df - ts = TSDataset(df=df, freq="D") - - transform = NewTransformMock() - transform._update_dataset = Mock() - transform._inverse_transform = lambda df: df - - transform.inverse_transform(ts=ts) - assert not transform._update_dataset.called From 660a591da487841ff0f8e37147b5573df8f198dd Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 19 Aug 2022 10:21:21 +0300 Subject: [PATCH 23/23] Fix typo --- etna/datasets/tsdataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index eb1583121..731cf89c6 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -875,7 +875,7 @@ def update_columns_from_pandas(self, df_update: pd.DataFrame): """Update the existing columns in the dataset with the new values from pandas dataframe. Before updating columns in df, columns of df_update will be cropped by the last timestamp in df. - If columns in df_exog are not updated. If you wish to update the df_exog, create the new + Columns in df_exog are not updated. If you wish to update the df_exog, create the new instance of TSDataset. Parameters