From ac53ed17d4bbb7409ea083c7b4b99965cecddef9 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 20 Jun 2023 10:21:39 +0300 Subject: [PATCH 01/10] Fix statistics relevance table --- etna/analysis/feature_relevance/relevance_table.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance_table.py b/etna/analysis/feature_relevance/relevance_table.py index e37da740a..aa58a9c4c 100644 --- a/etna/analysis/feature_relevance/relevance_table.py +++ b/etna/analysis/feature_relevance/relevance_table.py @@ -36,7 +36,6 @@ def _prepare_df(df: pd.DataFrame, df_exog: pd.DataFrame, segment: str, regressor def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> pd.DataFrame: """Calculate relevance table with p-values from tsfresh. - Parameters ---------- df: @@ -48,6 +47,10 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p ------- pd.DataFrame dataframe with p-values. + + Notes + ----- + Time complexity of this method is O(n_segments * n_features * history_len * log(history_len)) """ regressors = sorted(df_exog.columns.get_level_values("feature").unique()) segments = sorted(df.columns.get_level_values("segment").unique()) @@ -64,7 +67,9 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p "Exogenous data contains columns with category type! It will be converted to float. If this is not desired behavior, use encoders." 
) - relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg)[["feature", "p_value"]].values + relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg, ml_task="regression")[ + ["feature", "p_value"] + ].values result[k] = np.array(sorted(relevance, key=lambda x: x[0]))[:, 1] relevance_table = pd.DataFrame(result) relevance_table.index = segments From 1810698e806d2c5e3b77aebe878a3df29d76cfdb Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Tue, 20 Jun 2023 10:22:04 +0300 Subject: [PATCH 02/10] Speed up mrmr --- .../feature_selection/mrmr_selection.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index 5643877fb..9ba84c258 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -58,6 +58,10 @@ def mrmr( ------- selected_features: List[str] list of ``top_k`` selected regressors, sorted by their importance + + Notes + ----- + Time complexity of this method is O(top_k * n_segments * n_features * history_len) """ relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)] redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)] @@ -65,6 +69,7 @@ def mrmr( relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0) all_features = relevance.index.to_list() + segments = set(regressors.columns.get_level_values("segment")) selected_features: List[str] = [] not_selected_features = all_features.copy() @@ -76,20 +81,13 @@ def mrmr( score_denominator = pd.Series(1, index=not_selected_features) if i > 0: last_selected_feature = selected_features[-1] - not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]] - last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]] + candidate_regressors = 
regressors.loc[ + pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features + [last_selected_feature]] + ] + segment_redundancy = [candidate_regressors[segment].corr() for segment in segments] redundancy_table.loc[not_selected_features, last_selected_feature] = ( - not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col)) # noqa: B023 - .abs() - .groupby("feature") - .apply(redundancy_aggregation_fn) - .T.groupby("feature") - .apply(redundancy_aggregation_fn) - .clip(atol) - .fillna(np.inf) - .loc[not_selected_features] - .values.squeeze() + pd.concat(segment_redundancy).apply(redundancy_aggregation_fn, axis=0).loc[not_selected_features] ) score_denominator = redundancy_table.loc[not_selected_features, selected_features].mean(axis=1) From a027062168d0625d205c6af454364f6dde5762fc Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Thu, 22 Jun 2023 10:18:55 +0300 Subject: [PATCH 03/10] Add fast_redundancy flag to mrmr --- .../feature_selection/mrmr_selection.py | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index 9ba84c258..f888836d4 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -1,3 +1,4 @@ +import warnings from enum import Enum from typing import List @@ -26,6 +27,7 @@ def mrmr( relevance_table: pd.DataFrame, regressors: pd.DataFrame, top_k: int, + fast_redundancy: bool = False, relevance_aggregation_mode: str = AggregationMode.mean, redundancy_aggregation_mode: str = AggregationMode.mean, atol: float = 1e-10, @@ -47,6 +49,9 @@ def mrmr( dataframe with regressors in etna format top_k: num of regressors to select; if there are not enough regressors, then all will be selected + fast_redundancy: + * True: compute redundancy only inside the the segments, time complexity O(top_k * n_segments * n_features * history_len) + * False: 
compute redundancy for all the pairs of segments, time complexity O(top_k * n_segments * n_features * history_len) relevance_aggregation_mode: the method for relevance values per-segment aggregation redundancy_aggregation_mode: @@ -58,11 +63,13 @@ def mrmr( ------- selected_features: List[str] list of ``top_k`` selected regressors, sorted by their importance - - Notes - ----- - Time complexity of this method is O(top_k * n_segments * n_features * history_len) """ + if not fast_redundancy: + warnings.warn( + "`fast_redundancy=False` was added for backward compatibility and will be removed in etna 3.0.0.", + DeprecationWarning, + stacklevel=2, + ) relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)] redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)] @@ -81,13 +88,33 @@ def mrmr( score_denominator = pd.Series(1, index=not_selected_features) if i > 0: last_selected_feature = selected_features[-1] - candidate_regressors = regressors.loc[ - pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features + [last_selected_feature]] - ] + last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]] + not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]] + + if fast_redundancy: + segment_redundancy = pd.concat( + [ + not_selected_regressors[segment].apply( + lambda col: last_selected_regressor[segment].corrwith(col) # noqa: B023 + ) + for segment in segments + ] + ).abs() + else: + segment_redundancy = ( + not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col)) # noqa: B023 + .abs() + .groupby("feature") + .apply(redundancy_aggregation_fn) + .T.groupby("feature") + ) - segment_redundancy = [candidate_regressors[segment].corr() for segment in segments] redundancy_table.loc[not_selected_features, last_selected_feature] = ( - pd.concat(segment_redundancy).apply(redundancy_aggregation_fn, 
axis=0).loc[not_selected_features] + segment_redundancy.apply(redundancy_aggregation_fn) + .clip(atol) + .fillna(np.inf) + .loc[not_selected_features] + .values.squeeze() ) score_denominator = redundancy_table.loc[not_selected_features, selected_features].mean(axis=1) From 828f9812d6216685af28e214e58caef659d3a68a Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Thu, 22 Jun 2023 10:19:11 +0300 Subject: [PATCH 04/10] Update tests on mrmr --- .../test_feature_selection/test_mrmr.py | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py index bf3b473fc..bc1d2d3ab 100644 --- a/tests/test_analysis/test_feature_selection/test_mrmr.py +++ b/tests/test_analysis/test_feature_selection/test_mrmr.py @@ -58,22 +58,31 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]: } +@pytest.mark.parametrize("fast_redundancy", [True, False]) @pytest.mark.parametrize( "relevance_method, expected_regressors", [(ModelRelevanceTable(), ["regressor_useful_0", "regressor_useful_1", "regressor_useful_2"])], ) -def test_mrmr_right_regressors(df_with_regressors, relevance_method, expected_regressors): +def test_mrmr_right_regressors(df_with_regressors, relevance_method, expected_regressors, fast_redundancy): relevance_table = relevance_method( df=df_with_regressors["target"], df_exog=df_with_regressors["regressors"], model=RandomForestRegressor() ) - selected_regressors = mrmr(relevance_table=relevance_table, regressors=df_with_regressors["regressors"], top_k=3) + selected_regressors = mrmr( + relevance_table=relevance_table, + regressors=df_with_regressors["regressors"], + top_k=3, + fast_redundancy=fast_redundancy, + ) assert set(selected_regressors) == set(expected_regressors) -def test_mrmr_not_depend_on_columns_order(df_with_regressors): +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def 
test_mrmr_not_depend_on_columns_order(df_with_regressors, fast_redundancy): df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) - expected_answer = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=5) + expected_answer = mrmr( + relevance_table=relevance_table, regressors=regressors, top_k=5, fast_redundancy=fast_redundancy + ) columns = list(regressors.columns.get_level_values("feature").unique()) for i in range(10): np.random.shuffle(columns) @@ -81,6 +90,7 @@ def test_mrmr_not_depend_on_columns_order(df_with_regressors): relevance_table=relevance_table[columns], regressors=regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, columns]], top_k=5, + fast_redundancy=fast_redundancy, ) assert answer == expected_answer @@ -131,21 +141,36 @@ def high_relevance_high_redundancy_problem_diff_starts(periods=10): } -def test_mrmr_select_less_redundant_regressor(high_relevance_high_redundancy_problem): +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def test_mrmr_select_less_redundant_regressor(high_relevance_high_redundancy_problem, fast_redundancy): """Check that transform selects the less redundant regressor out of regressors with same relevance.""" relevance_table, regressors = ( high_relevance_high_redundancy_problem["relevance_table"], high_relevance_high_redundancy_problem["regressors"], ) - selected_regressors = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2) + selected_regressors = mrmr( + relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=fast_redundancy + ) assert set(selected_regressors) == set(high_relevance_high_redundancy_problem["expected_answer"]) -def test_mrmr_select_less_redundant_regressor_diff_start(high_relevance_high_redundancy_problem_diff_starts): +@pytest.mark.parametrize("fast_redundancy", [True, False]) +def 
test_mrmr_select_less_redundant_regressor_diff_start( + high_relevance_high_redundancy_problem_diff_starts, fast_redundancy +): """Check that transform selects the less redundant regressor out of regressors with same relevance.""" relevance_table, regressors = ( high_relevance_high_redundancy_problem_diff_starts["relevance_table"], high_relevance_high_redundancy_problem_diff_starts["regressors"], ) - selected_regressors = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2) + selected_regressors = mrmr( + relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=fast_redundancy + ) assert set(selected_regressors) == set(high_relevance_high_redundancy_problem_diff_starts["expected_answer"]) + + +def test_fast_redundancy_deprecation_warning(df_with_regressors): + df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] + relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) + with pytest.warns(DeprecationWarning): + mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False) From cb5c01cb567244d877134eeb0d70de0062ba5e2d Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Thu, 22 Jun 2023 10:19:56 +0300 Subject: [PATCH 05/10] Add fast_redundancy flag to mrmr transforms --- etna/transforms/feature_selection/feature_importance.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/etna/transforms/feature_selection/feature_importance.py b/etna/transforms/feature_selection/feature_importance.py index a1470a613..396b56800 100644 --- a/etna/transforms/feature_selection/feature_importance.py +++ b/etna/transforms/feature_selection/feature_importance.py @@ -127,6 +127,7 @@ def __init__( relevance_table: RelevanceTable, top_k: int, features_to_use: Union[List[str], Literal["all"]] = "all", + fast_redundancy: bool = False, relevance_aggregation_mode: str = AggregationMode.mean, redundancy_aggregation_mode: str = AggregationMode.mean, 
atol: float = 1e-10, @@ -145,6 +146,9 @@ def __init__( features_to_use: columns of the dataset to select from if "all" value is given, all columns are used + fast_redundancy: + * True: compute redundancy only inside the the segments, time complexity O(top_k * n_segments * n_features * history_len) + * False: compute redundancy for all the pairs of segments, time complexity O(top_k * n_segments * n_features * history_len) relevance_aggregation_mode: the method for relevance values per-segment aggregation redundancy_aggregation_mode: @@ -160,6 +164,7 @@ def __init__( super().__init__(features_to_use=features_to_use, return_features=return_features) self.relevance_table = relevance_table self.top_k = top_k + self.fast_redundancy = fast_redundancy self.relevance_aggregation_mode = relevance_aggregation_mode self.redundancy_aggregation_mode = redundancy_aggregation_mode self.atol = atol @@ -188,6 +193,7 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform": relevance_table=relevance_table, regressors=ts[:, :, features], top_k=self.top_k, + fast_redundancy=self.fast_redundancy, relevance_aggregation_mode=self.relevance_aggregation_mode, redundancy_aggregation_mode=self.redundancy_aggregation_mode, atol=self.atol, From 263cdc1ee5b079f8ccdd6de98901d5aa1944f2b6 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Thu, 22 Jun 2023 15:35:00 +0300 Subject: [PATCH 06/10] Review fixes --- etna/analysis/feature_relevance/relevance_table.py | 2 +- etna/analysis/feature_selection/mrmr_selection.py | 7 +++---- etna/transforms/feature_selection/feature_importance.py | 4 ++-- tests/test_analysis/test_feature_selection/test_mrmr.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance_table.py b/etna/analysis/feature_relevance/relevance_table.py index aa58a9c4c..c6241940a 100644 --- a/etna/analysis/feature_relevance/relevance_table.py +++ b/etna/analysis/feature_relevance/relevance_table.py @@ -50,7 +50,7 @@ 
def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p Notes ----- - Time complexity of this method is O(n_segments * n_features * history_len * log(history_len)) + Time complexity of this method is :math:`O(n\_segments * n\_features * history\_len)` """ regressors = sorted(df_exog.columns.get_level_values("feature").unique()) segments = sorted(df.columns.get_level_values("segment").unique()) diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index f888836d4..ba1d84eb3 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -50,8 +50,8 @@ def mrmr( top_k: num of regressors to select; if there are not enough regressors, then all will be selected fast_redundancy: - * True: compute redundancy only inside the the segments, time complexity O(top_k * n_segments * n_features * history_len) - * False: compute redundancy for all the pairs of segments, time complexity O(top_k * n_segments * n_features * history_len) + * True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)` + * False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)` relevance_aggregation_mode: the method for relevance values per-segment aggregation redundancy_aggregation_mode: @@ -66,9 +66,8 @@ def mrmr( """ if not fast_redundancy: warnings.warn( - "`fast_redundancy=False` was added for backward compatibility and will be removed in etna 3.0.0.", + "Option `fast_redundancy=False` was added for backward compatibility and will be removed in etna 3.0.0.", DeprecationWarning, - stacklevel=2, ) relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)] redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)] diff --git 
a/etna/transforms/feature_selection/feature_importance.py b/etna/transforms/feature_selection/feature_importance.py index e62566559..38ad2c098 100644 --- a/etna/transforms/feature_selection/feature_importance.py +++ b/etna/transforms/feature_selection/feature_importance.py @@ -191,8 +191,8 @@ def __init__( columns of the dataset to select from if "all" value is given, all columns are used fast_redundancy: - * True: compute redundancy only inside the the segments, time complexity O(top_k * n_segments * n_features * history_len) - * False: compute redundancy for all the pairs of segments, time complexity O(top_k * n_segments * n_features * history_len) + * True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)` + * False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)` relevance_aggregation_mode: the method for relevance values per-segment aggregation redundancy_aggregation_mode: diff --git a/tests/test_analysis/test_feature_selection/test_mrmr.py b/tests/test_analysis/test_feature_selection/test_mrmr.py index bc1d2d3ab..a45190bec 100644 --- a/tests/test_analysis/test_feature_selection/test_mrmr.py +++ b/tests/test_analysis/test_feature_selection/test_mrmr.py @@ -172,5 +172,5 @@ def test_mrmr_select_less_redundant_regressor_diff_start( def test_fast_redundancy_deprecation_warning(df_with_regressors): df, regressors = df_with_regressors["df"], df_with_regressors["regressors"] relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor()) - with pytest.warns(DeprecationWarning): + with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"): mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False) From 06be67aba5ee0e9354bc97aed8e8f20bead550fb Mon Sep 17 00:00:00 2001 From: alex-hse-repository 
Date: Thu, 22 Jun 2023 18:29:40 +0300 Subject: [PATCH 07/10] Fix tests --- .../test_feature_importance_transform.py | 8 +++-- .../test_inference/test_inverse_transform.py | 36 ++++++++++++++----- .../test_inference/test_transform.py | 30 ++++++++++++---- 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index 56fe0ba84..16bafc65b 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -279,13 +279,14 @@ def test_fit_transform_with_nans(model, ts_diff_endings): selector.fit_transform(ts_diff_endings) +@pytest.mark.parametrize("fast_redundancy", ([True, False])) @pytest.mark.parametrize("relevance_table", ([StatisticsRelevanceTable()])) @pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50]) -def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors): +def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors, fast_redundancy): """Check that transform selects exactly top_k regressors.""" all_regressors = ts_with_regressors.regressors ts = ts_with_regressors - mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=top_k) + mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=top_k, fast_redundancy=fast_redundancy) df_selected = mrmr.fit_transform(ts).to_pandas() selected_regressors = set() @@ -316,7 +317,8 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors): MRMRFeatureSelectionTransform( relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42) ), - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3), + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=True), + 
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=False), ], ) def test_save_load(transform, ts_with_regressors): diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index 0e23db17b..e1a45c7d2 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -125,7 +125,12 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments) # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), - (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -296,7 +301,12 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), - (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -473,13 +483,15 @@ def 
_test_inverse_transform_train_new_segments(self, ts, transform, train_segmen {"create": {"year", "weekday", "month"}}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {}, ), ( MRMRFeatureSelectionTransform( - relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True ), "ts_with_exog", {"create": {"monthday", "positive", "weekday"}}, @@ -762,7 +774,9 @@ def _test_inverse_transform_future_new_segments(self, ts, transform, train_segme {}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {}, ), @@ -1042,7 +1056,7 @@ def test_inverse_transform_future_new_segments_not_implemented(self, transform, ), ( MRMRFeatureSelectionTransform( - relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True ), "ts_with_exog", {"create": {"positive", "monthday", "weekday"}}, @@ -1166,7 +1180,9 @@ def _test_inverse_transform_future_with_target( {"create": {"month", "year", "positive"}}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {}, ), @@ -1539,7 +1555,9 @@ def _test_inverse_transform_future_without_target( {}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {}, ), @@ -1852,7 +1870,7 
@@ def test_inverse_transform_future_without_target_fail_resample( ), ( MRMRFeatureSelectionTransform( - relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True ), "ts_with_exog", {"create": {"weekday", "monthday", "positive"}}, diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index d601f98d5..fc70d7399 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -119,7 +119,12 @@ def _test_transform_train_subset_segments(self, ts, transform, segments): # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), - (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -280,7 +285,12 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog"), (GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), - (MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), "ts_with_exog"), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math 
(AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -447,7 +457,9 @@ def _test_transform_train_new_segments(self, ts, transform, train_segments, expe {"remove": {"weekday", "year", "month"}}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), @@ -715,7 +727,9 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp {"remove": {"weekday", "year", "month"}}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), @@ -1061,7 +1075,9 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga {"remove": {"month", "year", "positive"}}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), @@ -1380,7 +1396,9 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes, {"remove": {"month", "year", "weekday"}}, ), ( - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=2), + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True + ), "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), From 6566ee2017c6c55da8025373e320fee50505ca42 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Fri, 23 Jun 2023 08:09:21 +0300 Subject: [PATCH 08/10] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index ccaf1a93d..716e3aac1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add tuning stage into `Auto.fit` ([#1272](https://github.com/tinkoff-ai/etna/pull/1272)) - Add `params_to_tune` into `Tune` init ([#1282](https://github.com/tinkoff-ai/etna/pull/1282)) - Skip duplicates during `Tune.fit`, skip duplicates in `top_k`, add AutoML notebook ([#1285](https://github.com/tinkoff-ai/etna/pull/1285)) +- Add parameter `fast_redundancy` in `mrmr`, fix relevance calculation in `get_statistics_relevance_table` ([#1294](https://github.com/tinkoff-ai/etna/pull/1294)) + ### Fixed - Fix `plot_backtest` and `plot_backtest_interactive` on one-step forecast ([1260](https://github.com/tinkoff-ai/etna/pull/1260)) - Fix `BaseReconciliator` to work on `pandas==1.1.5` ([#1229](https://github.com/tinkoff-ai/etna/pull/1229)) From c3c59ebf8429f003bb0fd4b4bb576146b042f899 Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 3 Jul 2023 12:05:50 +0300 Subject: [PATCH 09/10] Review fixes --- .../test_feature_importance_transform.py | 10 ++- .../test_inference/test_inverse_transform.py | 70 +++++++++++++++++++ .../test_inference/test_transform.py | 40 +++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index 16bafc65b..ea6170b82 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -297,11 +297,14 @@ def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors, fast_redunda assert len(selected_regressors) == min(len(all_regressors), top_k) +@pytest.mark.parametrize("fast_redundancy", ([True, False])) 
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()])) -def test_mrmr_right_regressors(relevance_table, ts_with_regressors): +def test_mrmr_right_regressors(relevance_table, ts_with_regressors, fast_redundancy): """Check that transform selects right top_k regressors.""" ts = ts_with_regressors - mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=3, model=RandomForestRegressor()) + mrmr = MRMRFeatureSelectionTransform( + relevance_table=relevance_table, top_k=3, model=RandomForestRegressor(), fast_redundancy=fast_redundancy + ) df_selected = mrmr.fit_transform(ts).to_pandas() selected_regressors = set() for column in df_selected.columns.get_level_values("feature"): @@ -332,7 +335,8 @@ def test_save_load(transform, ts_with_regressors): MRMRFeatureSelectionTransform( relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42) ), - MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3), + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=True), + MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=False), ], ) def test_params_to_tune(transform, ts_with_regressors): diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index e1a45c7d2..564ff8ba2 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -131,6 +131,14 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments) ), "ts_with_exog", ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), + top_k=2, + fast_redundancy=GaleShapleyFeatureSelectionTransform, + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # 
math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -307,6 +315,12 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments ), "ts_with_exog", ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -489,6 +503,13 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen "ts_with_exog", {}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {}, + ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True @@ -496,6 +517,13 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen "ts_with_exog", {"create": {"monthday", "positive", "weekday"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False + ), + "ts_with_exog", + {"create": {"monthday", "positive", "weekday"}}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -780,6 +808,13 @@ def _test_inverse_transform_future_new_segments(self, ts, transform, train_segme "ts_with_exog", {}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -1061,6 +1096,13 @@ def test_inverse_transform_future_new_segments_not_implemented(self, transform, "ts_with_exog", {"create": {"positive", "monthday", "weekday"}}, ), + ( + 
MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False + ), + "ts_with_exog", + {"create": {"positive", "monthday", "weekday"}}, + ), ( TreeFeatureSelectionTransform( model=DecisionTreeRegressor(random_state=42), top_k=2, return_features=True @@ -1193,6 +1235,20 @@ def _test_inverse_transform_future_with_target( "ts_with_exog", {"create": {"weekday", "monthday", "positive"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {}, + ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=False + ), + "ts_with_exog", + {"create": {"weekday", "monthday", "positive"}}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -1561,6 +1617,13 @@ def _test_inverse_transform_future_without_target( "ts_with_exog", {}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -1875,6 +1938,13 @@ def test_inverse_transform_future_without_target_fail_resample( "ts_with_exog", {"create": {"weekday", "monthday", "positive"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False + ), + "ts_with_exog", + {"create": {"weekday", "monthday", "positive"}}, + ), ( TreeFeatureSelectionTransform( model=DecisionTreeRegressor(random_state=42), top_k=2, return_features=True diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index fc70d7399..5942dde87 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ 
b/tests/test_transforms/test_inference/test_transform.py @@ -125,6 +125,12 @@ def _test_transform_train_subset_segments(self, ts, transform, segments): ), "ts_with_exog", ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -291,6 +297,12 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo ), "ts_with_exog", ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + ), (TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog"), # math (AddConstTransform(in_column="target", value=1, inplace=False), "regular_ts"), @@ -463,6 +475,13 @@ def _test_transform_train_new_segments(self, ts, transform, train_segments, expe "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -733,6 +752,13 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -1081,6 +1107,13 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga 
"ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", @@ -1402,6 +1435,13 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes, "ts_with_exog", {"remove": {"weekday", "monthday", "positive"}}, ), + ( + MRMRFeatureSelectionTransform( + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + ), + "ts_with_exog", + {"remove": {"weekday", "monthday", "positive"}}, + ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), "ts_with_exog", From e7d8a7596f3d2428162d7db7ee343e3f8344c9ce Mon Sep 17 00:00:00 2001 From: alex-hse-repository Date: Mon, 3 Jul 2023 13:07:15 +0300 Subject: [PATCH 10/10] Fix parameter passing --- .../test_inference/test_inverse_transform.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index 564ff8ba2..e1aee98ba 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -1230,21 +1230,24 @@ def _test_inverse_transform_future_with_target( ), ( MRMRFeatureSelectionTransform( - relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True + relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {}, ), ( MRMRFeatureSelectionTransform( - relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False + relevance_table=StatisticsRelevanceTable(), + top_k=2, + return_features=True, + fast_redundancy=True, ), 
"ts_with_exog", - {}, + {"create": {"weekday", "monthday", "positive"}}, ), ( MRMRFeatureSelectionTransform( - relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=False + relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False ), "ts_with_exog", {"create": {"weekday", "monthday", "positive"}},