Skip to content

Speed up feature selection #1294

Merged
merged 13 commits into from
Jul 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add tuning stage into `Auto.fit` ([#1272](https://github.com/tinkoff-ai/etna/pull/1272))
- Add `params_to_tune` into `Tune` init ([#1282](https://github.com/tinkoff-ai/etna/pull/1282))
- Skip duplicates during `Tune.fit`, skip duplicates in `top_k`, add AutoML notebook ([#1285](https://github.com/tinkoff-ai/etna/pull/1285))
- Add parameter `fast_redundancy` in `mrmr`, fix relevance calculation in `get_model_relevance_table` ([#1294](https://github.com/tinkoff-ai/etna/pull/1294))

### Fixed
- Fix `plot_backtest` and `plot_backtest_interactive` on one-step forecast ([#1260](https://github.com/tinkoff-ai/etna/pull/1260))
- Fix `BaseReconciliator` to work on `pandas==1.1.5` ([#1229](https://github.com/tinkoff-ai/etna/pull/1229))
Expand Down
9 changes: 7 additions & 2 deletions etna/analysis/feature_relevance/relevance_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def _prepare_df(df: pd.DataFrame, df_exog: pd.DataFrame, segment: str, regressor

def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> pd.DataFrame:
"""Calculate relevance table with p-values from tsfresh.

Parameters
----------
df:
Expand All @@ -48,6 +47,10 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p
-------
pd.DataFrame
dataframe with p-values.

Notes
-----
Time complexity of this method is :math:`O(n\_segments * n\_features * history\_len)`
"""
regressors = sorted(df_exog.columns.get_level_values("feature").unique())
segments = sorted(df.columns.get_level_values("segment").unique())
Expand All @@ -64,7 +67,9 @@ def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> p
"Exogenous data contains columns with category type! It will be converted to float. If this is not desired behavior, use encoders."
)

relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg)[["feature", "p_value"]].values
relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg, ml_task="regression")[
["feature", "p_value"]
].values
result[k] = np.array(sorted(relevance, key=lambda x: x[0]))[:, 1]
relevance_table = pd.DataFrame(result)
relevance_table.index = segments
Expand Down
38 changes: 31 additions & 7 deletions etna/analysis/feature_selection/mrmr_selection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from enum import Enum
from typing import List

Expand Down Expand Up @@ -26,6 +27,7 @@ def mrmr(
relevance_table: pd.DataFrame,
regressors: pd.DataFrame,
top_k: int,
fast_redundancy: bool = False,
relevance_aggregation_mode: str = AggregationMode.mean,
redundancy_aggregation_mode: str = AggregationMode.mean,
atol: float = 1e-10,
Expand All @@ -47,6 +49,9 @@ def mrmr(
dataframe with regressors in etna format
top_k:
num of regressors to select; if there are not enough regressors, then all will be selected
fast_redundancy:
* True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)`
* False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)`
relevance_aggregation_mode:
the method for relevance values per-segment aggregation
redundancy_aggregation_mode:
Expand All @@ -59,12 +64,18 @@ def mrmr(
selected_features: List[str]
list of ``top_k`` selected regressors, sorted by their importance
"""
if not fast_redundancy:
warnings.warn(
"Option `fast_redundancy=False` was added for backward compatibility and will be removed in etna 3.0.0.",
DeprecationWarning,
)
relevance_aggregation_fn = AGGREGATION_FN[AggregationMode(relevance_aggregation_mode)]
redundancy_aggregation_fn = AGGREGATION_FN[AggregationMode(redundancy_aggregation_mode)]

relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)

all_features = relevance.index.to_list()
segments = set(regressors.columns.get_level_values("segment"))
selected_features: List[str] = []
not_selected_features = all_features.copy()

Expand All @@ -76,16 +87,29 @@ def mrmr(
score_denominator = pd.Series(1, index=not_selected_features)
if i > 0:
last_selected_feature = selected_features[-1]
not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]]
last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]]
not_selected_regressors = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, not_selected_features]]

if fast_redundancy:
segment_redundancy = pd.concat(
[
not_selected_regressors[segment].apply(
lambda col: last_selected_regressor[segment].corrwith(col) # noqa: B023
)
for segment in segments
]
).abs()
else:
segment_redundancy = (
not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col)) # noqa: B023
.abs()
.groupby("feature")
.apply(redundancy_aggregation_fn)
.T.groupby("feature")
)

redundancy_table.loc[not_selected_features, last_selected_feature] = (
not_selected_regressors.apply(lambda col: last_selected_regressor.corrwith(col)) # noqa: B023
.abs()
.groupby("feature")
.apply(redundancy_aggregation_fn)
.T.groupby("feature")
.apply(redundancy_aggregation_fn)
segment_redundancy.apply(redundancy_aggregation_fn)
.clip(atol)
.fillna(np.inf)
.loc[not_selected_features]
Expand Down
6 changes: 6 additions & 0 deletions etna/transforms/feature_selection/feature_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def __init__(
relevance_table: RelevanceTable,
top_k: int,
features_to_use: Union[List[str], Literal["all"]] = "all",
fast_redundancy: bool = False,
relevance_aggregation_mode: str = AggregationMode.mean,
redundancy_aggregation_mode: str = AggregationMode.mean,
atol: float = 1e-10,
Expand All @@ -189,6 +190,9 @@ def __init__(
features_to_use:
columns of the dataset to select from
if "all" value is given, all columns are used
fast_redundancy:
* True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)`
* False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)`
relevance_aggregation_mode:
the method for relevance values per-segment aggregation
redundancy_aggregation_mode:
Expand All @@ -204,6 +208,7 @@ def __init__(
super().__init__(features_to_use=features_to_use, return_features=return_features)
self.relevance_table = relevance_table
self.top_k = top_k
self.fast_redundancy = fast_redundancy
self.relevance_aggregation_mode = relevance_aggregation_mode
self.redundancy_aggregation_mode = redundancy_aggregation_mode
self.atol = atol
Expand Down Expand Up @@ -232,6 +237,7 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
relevance_table=relevance_table,
regressors=ts[:, :, features],
top_k=self.top_k,
fast_redundancy=self.fast_redundancy,
relevance_aggregation_mode=self.relevance_aggregation_mode,
redundancy_aggregation_mode=self.redundancy_aggregation_mode,
atol=self.atol,
Expand Down
41 changes: 33 additions & 8 deletions tests/test_analysis/test_feature_selection/test_mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,29 +58,39 @@ def df_with_regressors() -> Dict[str, pd.DataFrame]:
}


@pytest.mark.parametrize("fast_redundancy", [True, False])
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize(
"relevance_method, expected_regressors",
[(ModelRelevanceTable(), ["regressor_useful_0", "regressor_useful_1", "regressor_useful_2"])],
)
def test_mrmr_right_regressors(df_with_regressors, relevance_method, expected_regressors):
def test_mrmr_right_regressors(df_with_regressors, relevance_method, expected_regressors, fast_redundancy):
relevance_table = relevance_method(
df=df_with_regressors["target"], df_exog=df_with_regressors["regressors"], model=RandomForestRegressor()
)
selected_regressors = mrmr(relevance_table=relevance_table, regressors=df_with_regressors["regressors"], top_k=3)
selected_regressors = mrmr(
relevance_table=relevance_table,
regressors=df_with_regressors["regressors"],
top_k=3,
fast_redundancy=fast_redundancy,
)
assert set(selected_regressors) == set(expected_regressors)


def test_mrmr_not_depend_on_columns_order(df_with_regressors):
@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_not_depend_on_columns_order(df_with_regressors, fast_redundancy):
df, regressors = df_with_regressors["df"], df_with_regressors["regressors"]
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
expected_answer = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=5)
expected_answer = mrmr(
relevance_table=relevance_table, regressors=regressors, top_k=5, fast_redundancy=fast_redundancy
)
columns = list(regressors.columns.get_level_values("feature").unique())
for i in range(10):
np.random.shuffle(columns)
answer = mrmr(
relevance_table=relevance_table[columns],
regressors=regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, columns]],
top_k=5,
fast_redundancy=fast_redundancy,
)
assert answer == expected_answer

Expand Down Expand Up @@ -131,21 +141,36 @@ def high_relevance_high_redundancy_problem_diff_starts(periods=10):
}


def test_mrmr_select_less_redundant_regressor(high_relevance_high_redundancy_problem):
@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_select_less_redundant_regressor(high_relevance_high_redundancy_problem, fast_redundancy):
"""Check that transform selects the less redundant regressor out of regressors with same relevance."""
relevance_table, regressors = (
high_relevance_high_redundancy_problem["relevance_table"],
high_relevance_high_redundancy_problem["regressors"],
)
selected_regressors = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2)
selected_regressors = mrmr(
relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=fast_redundancy
)
assert set(selected_regressors) == set(high_relevance_high_redundancy_problem["expected_answer"])


def test_mrmr_select_less_redundant_regressor_diff_start(high_relevance_high_redundancy_problem_diff_starts):
@pytest.mark.parametrize("fast_redundancy", [True, False])
def test_mrmr_select_less_redundant_regressor_diff_start(
high_relevance_high_redundancy_problem_diff_starts, fast_redundancy
):
"""Check that transform selects the less redundant regressor out of regressors with same relevance."""
relevance_table, regressors = (
high_relevance_high_redundancy_problem_diff_starts["relevance_table"],
high_relevance_high_redundancy_problem_diff_starts["regressors"],
)
selected_regressors = mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2)
selected_regressors = mrmr(
relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=fast_redundancy
)
assert set(selected_regressors) == set(high_relevance_high_redundancy_problem_diff_starts["expected_answer"])


def test_fast_redundancy_deprecation_warning(df_with_regressors):
df, regressors = df_with_regressors["df"], df_with_regressors["regressors"]
relevance_table = ModelRelevanceTable()(df=df, df_exog=regressors, model=RandomForestRegressor())
with pytest.warns(DeprecationWarning, match="Option `fast_redundancy=False` was added for backward compatibility"):
mrmr(relevance_table=relevance_table, regressors=regressors, top_k=2, fast_redundancy=False)
Original file line number Diff line number Diff line change
Expand Up @@ -279,13 +279,14 @@ def test_fit_transform_with_nans(model, ts_diff_endings):
selector.fit_transform(ts_diff_endings)


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([StatisticsRelevanceTable()]))
@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50])
def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors):
def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors, fast_redundancy):
"""Check that transform selects exactly top_k regressors."""
all_regressors = ts_with_regressors.regressors
ts = ts_with_regressors
mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=top_k)
mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=top_k, fast_redundancy=fast_redundancy)
df_selected = mrmr.fit_transform(ts).to_pandas()

selected_regressors = set()
Expand All @@ -296,11 +297,14 @@ def test_mrmr_right_len(relevance_table, top_k, ts_with_regressors):
assert len(selected_regressors) == min(len(all_regressors), top_k)


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()]))
def test_mrmr_right_regressors(relevance_table, ts_with_regressors):
def test_mrmr_right_regressors(relevance_table, ts_with_regressors, fast_redundancy):
"""Check that transform selects right top_k regressors."""
ts = ts_with_regressors
mrmr = MRMRFeatureSelectionTransform(relevance_table=relevance_table, top_k=3, model=RandomForestRegressor())
mrmr = MRMRFeatureSelectionTransform(
relevance_table=relevance_table, top_k=3, model=RandomForestRegressor(), fast_redundancy=fast_redundancy
)
df_selected = mrmr.fit_transform(ts).to_pandas()
selected_regressors = set()
for column in df_selected.columns.get_level_values("feature"):
Expand All @@ -316,7 +320,8 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors):
MRMRFeatureSelectionTransform(
relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42)
),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=True),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=False),
],
)
def test_save_load(transform, ts_with_regressors):
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -330,7 +335,8 @@ def test_save_load(transform, ts_with_regressors):
MRMRFeatureSelectionTransform(
relevance_table=ModelRelevanceTable(), top_k=3, model=RandomForestRegressor(random_state=42)
),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=True),
MRMRFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=3, fast_redundancy=False),
],
)
def test_params_to_tune(transform, ts_with_regressors):
Expand Down
Loading