Skip to content

New base class for transforms #835

Merged
merged 23 commits into from
Aug 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
a004b97
Add get_regresors_info
alex-hse-repository Aug 1, 2022
df117a9
Fix method naming for remove_columns
alex-hse-repository Aug 4, 2022
d7e6ffe
Add new version of base class for transforms
alex-hse-repository Aug 4, 2022
24f87bc
Replace DummyInColumnMixin with default in_column in base class
alex-hse-repository Aug 5, 2022
32a3ac8
Fix required_features property
alex-hse-repository Aug 5, 2022
8c583f6
Add tests
alex-hse-repository Aug 5, 2022
7ef3e21
Update Changelog
alex-hse-repository Aug 5, 2022
04fd810
Add drop_exog flag
alex-hse-repository Aug 10, 2022
549efca
Fix methods in tsdataset
alex-hse-repository Aug 12, 2022
ef5df42
Add common logic to inverse_transform
alex-hse-repository Aug 15, 2022
b49185a
Add tests for base transform
alex-hse-repository Aug 15, 2022
efa2447
Update dataset methods
alex-hse-repository Aug 15, 2022
8070a75
Comment unfinished tests
alex-hse-repository Aug 15, 2022
8ec19da
Add override decorator
alex-hse-repository Aug 16, 2022
f4c91bd
Remove unnesesary file
alex-hse-repository Aug 16, 2022
2b5f276
Separate method for updating/adding columns
alex-hse-repository Aug 17, 2022
c32cbce
Update tests for dataset
alex-hse-repository Aug 17, 2022
41d1982
Update base class
alex-hse-repository Aug 17, 2022
951dfe2
Update changelog
alex-hse-repository Aug 17, 2022
88b027c
Fix tests
alex-hse-repository Aug 17, 2022
f06f073
Fixes
alex-hse-repository Aug 17, 2022
857ff04
Remove implementation of inverse_transform from the base class
alex-hse-repository Aug 19, 2022
660a591
Fix typo
alex-hse-repository Aug 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
-
-
-
- Base class for transforms now works with TSDataset ([#835](https://github.com/tinkoff-ai/etna/pull/835))
- All the transforms now have an `in_column` attribute ([#820](https://github.com/tinkoff-ai/etna/pull/820))
-
-
- Rename `remove_columns` to `drop_features`; split method `update_columns_from_pandas` into `add_columns_from_pandas` and `update_columns_from_pandas` ([#835](https://github.com/tinkoff-ai/etna/pull/835))
-
### Fixed
-
-
Expand Down
76 changes: 49 additions & 27 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,50 +871,72 @@ def train_test_split(

return train, test

def update_columns_from_pandas(self, df: pd.DataFrame, regressors: Optional[List[str]] = None):
"""Update the dataset with the new columns from pandas dataframe.
def update_columns_from_pandas(self, df_update: pd.DataFrame):
"""Update the existing columns in the dataset with the new values from pandas dataframe.

It is recommended to add the exogenous regressor using the constructor of the TSDataset.
This method does not add the regressors as exogenous data in df_exog, but only update the df attribute.
Before updating columns in df, columns of df_update will be cropped by the last timestamp in df.
Columns in df_exog are not updated. If you wish to update the df_exog, create the new
instance of TSDataset.

Parameters
----------
df:
Dataframe with the new columns in wide ETNA format.
If columns with the same names already exist in the dataset, then values will be updated.
regressors:
List of regressors in the passed dataframe
df_update:
Dataframe with new values in wide ETNA format.
"""
columns_in_dataset = self.columns.get_level_values("feature")
new_columns = df.columns.get_level_values("feature")

columns_to_update = list(set(columns_in_dataset) & set(new_columns))
self.df.loc[:, self.idx[self.segments, columns_to_update]] = df.loc[
:, self.idx[self.segments, columns_to_update]
columns_to_update = sorted(set(df_update.columns.get_level_values("feature")))
self.df.loc[:, self.idx[self.segments, columns_to_update]] = df_update.loc[
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
: self.df.index.max(), self.idx[self.segments, columns_to_update]
]

columns_to_add = list(set(new_columns) - set(columns_in_dataset))
if len(columns_to_add) != 0:
self.df = pd.concat((self.df, df.loc[:, self.idx[:, columns_to_add]]), axis=1).sort_index(axis=1)
def add_columns_from_pandas(
    self, df_update: pd.DataFrame, update_exog: bool = False, regressors: Optional[List[str]] = None
):
    """Add new columns from a pandas dataframe to the dataset.

    Before the columns are added to df, the rows of df_update are cropped by the last timestamp in df.
    When df_exog is updated, it receives df_update without cropping.

    Parameters
    ----------
    df_update:
        Dataframe with the new columns in wide ETNA format.
    update_exog:
        If True, add the columns to df_exog as well.
        If you wish to add new regressors to the dataset it is recommended to turn on this flag.
    regressors:
        List of regressors in the passed dataframe.
    """
    last_timestamp = self.df.index.max()
    self.df = pd.concat((self.df, df_update.loc[:last_timestamp]), axis=1).sort_index(axis=1)
    if update_exog:
        if self.df_exog is None:
            # Store a copy: df_exog is edited in place when features are dropped,
            # and that must never leak into the dataframe owned by the caller.
            self.df_exog = df_update.copy()
        else:
            self.df_exog = pd.concat((self.df_exog, df_update), axis=1).sort_index(axis=1)
    if regressors is not None:
        # Order-preserving union: known regressors keep their position, new ones are appended,
        # so the resulting list is deterministic between runs.
        self._regressors = list(dict.fromkeys([*self._regressors, *regressors]))

def remove_columns(self, columns: List[str]):
"""Remove columns from the dataset.

Columns that are not presented in the dataset will be ignored
def drop_features(self, features: List[str], drop_from_exog: bool = False):
"""Drop columns with features from the dataset.

Parameters
----------
columns:
List of columns to be removed
features:
List of features to drop.
drop_from_exog:
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
* If False, drop features only from df. Features will appear again in df after make_future.
* If True, drop features from df and df_exog. Features won't appear in df after make_future.
"""
for df in [self.df, self.df_exog, self.raw_df]:
dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))

for name, df in dfs:
columns_in_df = df.columns.get_level_values("feature")
columns_to_remove = list(set(columns_in_df) & set(columns))
columns_to_remove = list(set(columns_in_df) & set(features))
unknown_columns = set(features) - set(columns_to_remove)
if len(unknown_columns) != 0:
warnings.warn(f"Features {unknown_columns} are not present in {name}!")
df.drop(columns=columns_to_remove, level="feature", inplace=True)
self._regressors = list(set(self._regressors) - set(columns))
self._regressors = list(set(self._regressors) - set(features))

@property
def index(self) -> pd.core.indexes.datetimes.DatetimeIndex:
Expand Down
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from etna.transforms.base import NewTransform
from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform
from etna.transforms.decomposition import BinsegTrendTransform
Expand Down
157 changes: 154 additions & 3 deletions etna/transforms/base.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,171 @@
from abc import ABC
from abc import abstractmethod
from copy import deepcopy
from typing import List
from typing import Union

import pandas as pd
from typing_extensions import Literal

from etna.core import BaseMixin
from etna.datasets import TSDataset


class FutureMixin:
"""Mixin for transforms that can convert non-regressor column to a regressor one."""


class DymmyInColumnMixin:
"""Mixin for transforms that has no explicit in_column."""
class NewTransform(ABC, BaseMixin):
    """Base class to create any transforms to apply to data.

    Subclasses implement ``_fit`` and ``_transform`` on dataframes in etna wide format.
    The public ``fit`` / ``transform`` methods extract the required features from the TSDataset,
    delegate to these hooks and write the result back into the dataset.
    """

    def __init__(self, in_column: Union[Literal["all"], List[str], str] = "target"):
        """Init the transform.

        Parameters
        ----------
        in_column:
            Name of the column, list of column names or "all" to select the features
            the transform works with.
        """
        self.in_column = in_column

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform.

        Returns
        -------
        :
            List with regressors created by the transform.
        """
        return []

    @property
    def required_features(self) -> Union[Literal["all"], List[str]]:
        """Get the list of required features.

        Returns
        -------
        :
            "all" if ``in_column`` is "all", otherwise the list of feature names.

        Raises
        ------
        ValueError:
            If ``in_column`` is neither a string nor a list.
        """
        required_features = self.in_column
        if isinstance(required_features, list):
            return required_features
        if isinstance(required_features, str):
            # The "all" marker is passed through as is; a single name is wrapped into a list.
            return "all" if required_features == "all" else [required_features]
        raise ValueError("in_column attribute is in incorrect format!")

    def _update_dataset(self, ts: TSDataset, df: pd.DataFrame, df_transformed: pd.DataFrame) -> TSDataset:
        """Update TSDataset based on the difference between dfs.

        Features that disappeared after the transformation are dropped from the dataset,
        new features are added and features present in both dataframes are updated.

        Parameters
        ----------
        ts:
            Dataset to update.
        df:
            Dataframe in etna wide format before the transformation.
        df_transformed:
            Dataframe in etna wide format after the transformation.

        Returns
        -------
        :
            The updated dataset (the same object as ``ts``).
        """
        columns_before = set(df.columns.get_level_values("feature"))
        columns_after = set(df_transformed.columns.get_level_values("feature"))
        # Sorted so the order in which columns reach the dataset does not depend on set hashing.
        columns_to_update = sorted(columns_before & columns_after)
        columns_to_add = sorted(columns_after - columns_before)
        columns_to_remove = sorted(columns_before - columns_after)

        if columns_to_remove:
            ts.drop_features(features=columns_to_remove, drop_from_exog=False)
        if columns_to_add:
            new_regressors = self.get_regressors_info()
            ts.add_columns_from_pandas(
                df_update=df_transformed.loc[:, pd.IndexSlice[ts.segments, columns_to_add]],
                update_exog=False,
                regressors=new_regressors,
            )
        if columns_to_update:
            ts.update_columns_from_pandas(
                df_update=df_transformed.loc[:, pd.IndexSlice[ts.segments, columns_to_update]]
            )
        return ts

    @abstractmethod
    def _fit(self, df: pd.DataFrame):
        """Fit the transform.

        Should be implemented by user.

        Parameters
        ----------
        df:
            Dataframe in etna wide format.
        """
        pass

    def fit(self, ts: TSDataset) -> "NewTransform":
        """Fit the transform.

        Parameters
        ----------
        ts:
            Dataset to fit the transform on.

        Returns
        -------
        :
            The fitted transform instance.
        """
        df = ts.to_pandas(flatten=False, features=self.required_features)
        self._fit(df=df)
        return self

    @abstractmethod
    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform dataframe.

        Should be implemented by user.

        Parameters
        ----------
        df:
            Dataframe in etna wide format.

        Returns
        -------
        :
            Transformed Dataframe in etna wide format.
        """
        pass

    def transform(self, ts: TSDataset) -> TSDataset:
        """Transform TSDataset inplace.

        Parameters
        ----------
        ts:
            Dataset to transform.

        Returns
        -------
        :
            Transformed TSDataset.
        """
        df = ts.to_pandas(flatten=False, features=self.required_features)
        df_transformed = self._transform(df=df)
        return self._update_dataset(ts=ts, df=df, df_transformed=df_transformed)

    def fit_transform(self, ts: TSDataset) -> TSDataset:
        """Fit and transform TSDataset.

        May be reimplemented. But it is not recommended.

        Parameters
        ----------
        ts:
            TSDataset to transform.

        Returns
        -------
        :
            Transformed TSDataset.
        """
        return self.fit(ts=ts).transform(ts=ts)

    def inverse_transform(self, ts: TSDataset) -> TSDataset:
        """Inverse transform TSDataset.

        Should be reimplemented in the subclasses where necessary.
        The base implementation returns the dataset unchanged.

        Parameters
        ----------
        ts:
            TSDataset to be inverse transformed.

        Returns
        -------
        :
            TSDataset after applying inverse transformation.
        """
        return ts
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved


class Transform(ABC, BaseMixin):
Expand Down
3 changes: 1 addition & 2 deletions etna/transforms/encoders/mean_segment_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
import pandas as pd

from etna.transforms import Transform
from etna.transforms.base import DymmyInColumnMixin
from etna.transforms.base import FutureMixin
from etna.transforms.math.statistics import MeanTransform


class MeanSegmentEncoderTransform(Transform, FutureMixin, DymmyInColumnMixin):
class MeanSegmentEncoderTransform(Transform, FutureMixin):
"""Makes expanding mean target encoding of the segment. Creates column 'segment_mean'."""

idx = pd.IndexSlice
Expand Down
3 changes: 1 addition & 2 deletions etna/transforms/encoders/segment_encoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import pandas as pd
from sklearn import preprocessing

from etna.transforms.base import DymmyInColumnMixin
from etna.transforms.base import FutureMixin
from etna.transforms.base import Transform


class SegmentEncoderTransform(Transform, FutureMixin, DymmyInColumnMixin):
class SegmentEncoderTransform(Transform, FutureMixin):
"""Encode segment label to categorical. Creates column 'segment_code'."""

idx = pd.IndexSlice
Expand Down
3 changes: 1 addition & 2 deletions etna/transforms/feature_selection/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from abc import ABC
from typing import List
from typing import Optional
from typing import Sequence
from typing import Union

import pandas as pd
Expand All @@ -19,7 +18,7 @@ def __init__(self, features_to_use: Union[List[str], Literal["all"]] = "all", re
self.selected_features: List[str] = []
self.return_features = return_features
self._df_removed: Optional[pd.DataFrame] = None
self.in_column: Union[Sequence[str], Literal["all"]] = self.features_to_use
self.in_column = self.features_to_use

def _get_features_to_use(self, df: pd.DataFrame) -> List[str]:
"""Get list of features from the dataframe to perform the selection on."""
Expand Down
6 changes: 2 additions & 4 deletions etna/transforms/feature_selection/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,12 @@

import pandas as pd

from etna.transforms.base import DymmyInColumnMixin
from etna.transforms.base import Transform


class FilterFeaturesTransform(Transform, DymmyInColumnMixin):
class FilterFeaturesTransform(Transform):
"""Filters features in each segment of the dataframe."""

in_column = "all"

def __init__(
self,
include: Optional[Sequence[str]] = None,
Expand Down Expand Up @@ -43,6 +40,7 @@ def __init__(
self.exclude = list(set(exclude))
else:
raise ValueError("There should be exactly one option set: include or exclude")
self.in_column = "all"
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved

def fit(self, df: pd.DataFrame) -> "FilterFeaturesTransform":
"""Fit method does nothing and is kept for compatibility.
Expand Down
6 changes: 2 additions & 4 deletions etna/transforms/nn/pytorch_forecasting.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

from etna import SETTINGS
from etna.datasets.tsdataset import TSDataset
from etna.transforms.base import DymmyInColumnMixin
from etna.transforms.base import Transform

if SETTINGS.torch_required:
Expand All @@ -28,16 +27,14 @@
NORMALIZER = Union[TorchNormalizer, NaNLabelEncoder, EncoderNormalizer]


class PytorchForecastingTransform(Transform, DymmyInColumnMixin):
class PytorchForecastingTransform(Transform):
"""Transform for models from PytorchForecasting library.

Notes
-----
This transform should be added at the very end of ``transforms`` parameter.
"""

in_column = "all"

def __init__(
self,
max_encoder_length: int = 30,
Expand Down Expand Up @@ -93,6 +90,7 @@ def __init__(
self.lags = lags if lags else {}
self.scalers = scalers if scalers else {}
self.pf_dataset_predict: Optional[TimeSeriesDataSet] = None
self.in_column = "all"

def fit(self, df: pd.DataFrame) -> "PytorchForecastingTransform":
"""
Expand Down
Loading