Skip to content

Fix constant_value in imputer #819

Merged
merged 4 commits into from
Jul 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
-
### Fixed
-
- Fix missing `constant_value` in `TimeSeriesImputerTransform` ([#819](https://github.com/tinkoff-ai/etna/pull/819))
-
-
- Make in-sample predictions of SARIMAXModel non-dynamic in all cases ([#812](https://github.com/tinkoff-ai/etna/pull/812))
Expand Down
16 changes: 11 additions & 5 deletions etna/transforms/missing_values/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(
window: int,
seasonality: int,
default_value: Optional[float],
constant_value: int = 0,
constant_value: float = 0,
):
"""
Create instance of _OneSegmentTimeSeriesImputerTransform.
Expand Down Expand Up @@ -74,6 +74,8 @@ def __init__(
the length of the seasonality
default_value:
value which will be used to impute the NaNs left after applying the imputer with the chosen strategy
constant_value:
value to fill gaps in "constant" strategy

Raises
------
Expand All @@ -82,11 +84,11 @@ def __init__(
"""
self.in_column = in_column
self.strategy = ImputerMode(strategy)
self.constant_value = constant_value
self.window = window
self.seasonality = seasonality
self.default_value = default_value
self.fill_value: Optional[int] = None
self.constant_value = constant_value
self.fill_value: Optional[float] = None
julia-shenshina marked this conversation as resolved.
Show resolved Hide resolved
self.nan_timestamps: Optional[List[pd.Timestamp]] = None

def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform":
Expand All @@ -110,7 +112,7 @@ def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform":
self.nan_timestamps = series[series.isna()].index
if self.strategy == ImputerMode.zero:
warnings.warn(
"zero strategy will be removed in etna 1.12.0. Use constant strategy instead.",
"zero strategy will be removed in etna 2.0.0. Use constant strategy instead.",
DeprecationWarning,
stacklevel=2,
)
Expand Down Expand Up @@ -227,7 +229,7 @@ def __init__(
window: int = -1,
seasonality: int = 1,
default_value: Optional[float] = None,
constant_value: int = 0,
constant_value: float = 0,
):
"""
Create instance of TimeSeriesImputerTransform.
Expand Down Expand Up @@ -262,6 +264,8 @@ def __init__(
the length of the seasonality
default_value:
value which will be used to impute the NaNs left after applying the imputer with the chosen strategy
constant_value:
value to fill gaps in "constant" strategy

Raises
------
Expand All @@ -273,13 +277,15 @@ def __init__(
self.window = window
self.seasonality = seasonality
self.default_value = default_value
self.constant_value = constant_value
super().__init__(
transform=_OneSegmentTimeSeriesImputerTransform(
in_column=self.in_column,
strategy=self.strategy,
window=self.window,
seasonality=self.seasonality,
default_value=self.default_value,
constant_value=self.constant_value,
)
)

Expand Down
42 changes: 34 additions & 8 deletions tests/test_transforms/test_missing_values/test_impute_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,25 +78,37 @@ def test_all_missing_impute_fail_two_segments(df_all_missing_two_segments: pd.Da
_ = imputer.fit_transform(df_all_missing_two_segments)


def test_one_missing_value_zero(df_with_missing_value_x_index: pd.DataFrame):
"""Check that imputer with constant-strategy with zero value correctly in case of one missing value in data."""
@pytest.mark.parametrize("constant_value", (0, 42))
def test_one_missing_value_constant(df_with_missing_value_x_index: pd.DataFrame, constant_value: float):
"""Check that imputer with constant-strategy works correctly in case of one missing value in data."""
df, idx = df_with_missing_value_x_index
imputer = _OneSegmentTimeSeriesImputerTransform(
in_column="target", strategy="constant", window=-1, seasonality=1, default_value=None
in_column="target",
strategy="constant",
window=-1,
seasonality=1,
default_value=None,
constant_value=constant_value,
)
result = imputer.fit_transform(df)["target"]
assert result.loc[idx] == 0
assert result.loc[idx] == constant_value
assert not result.isna().any()


def test_range_missing_zero(df_with_missing_range_x_index: pd.DataFrame):
"""Check that imputer with constant-strategy with zero value works correctly in case of range of missing values in data."""
@pytest.mark.parametrize("constant_value", (0, 42))
def test_range_missing_constant(df_with_missing_range_x_index: pd.DataFrame, constant_value: float):
"""Check that imputer with constant-strategy works correctly in case of range of missing values in data."""
df, rng = df_with_missing_range_x_index
imputer = _OneSegmentTimeSeriesImputerTransform(
in_column="target", strategy="constant", window=-1, seasonality=1, default_value=None
in_column="target",
strategy="constant",
window=-1,
seasonality=1,
default_value=None,
constant_value=constant_value,
)
result = imputer.fit_transform(df)["target"]
expected_series = pd.Series(index=rng, data=[0 for _ in rng], name="target")
expected_series = pd.Series(index=rng, data=[constant_value for _ in rng], name="target")
np.testing.assert_array_almost_equal(result.loc[rng].reset_index(drop=True), expected_series)
assert not result.isna().any()

Expand Down Expand Up @@ -360,3 +372,17 @@ def test_fit_transform_nans_at_the_end(fill_strategy, ts_diff_endings):
imputer = TimeSeriesImputerTransform(in_column="target", strategy=fill_strategy)
ts_diff_endings.fit_transform([imputer])
assert (ts_diff_endings[:, :, "target"].isna()).sum().sum() == 0


@pytest.mark.parametrize("constant_value", (0, 32))
def test_constant_fill_strategy(df_with_missing_range_x_index_two_segments: pd.DataFrame, constant_value: float):
    """Check that "constant" strategy fills the missing range with ``constant_value`` in both segments."""
    source_df, missing_range = df_with_missing_range_x_index_two_segments
    # The frequency is inferred from the last five timestamps of the fixture's index.
    dataset = TSDataset(source_df, freq=pd.infer_freq(source_df.index[-5:]))
    # default_value is set to a value different from constant_value, so the assertion
    # below only passes when the gaps are filled with constant_value itself.
    transform = TimeSeriesImputerTransform(
        in_column="target",
        strategy="constant",
        constant_value=constant_value,
        default_value=constant_value - 1,
    )
    dataset.fit_transform([transform])
    imputed = dataset.to_pandas(flatten=False)
    expected = [constant_value] * 5
    for segment_name in ("segment_1", "segment_2"):
        filled = imputed.loc[missing_range][segment_name]["target"].values
        np.testing.assert_array_equal(filled, expected)