Skip to content

Commit 5c32e64

Browse files
martinbomio authored and park12sj committed
[data/preprocessors] feat: allow simple imputer to execute on append mode (ray-project#50713)
<!-- Thank you for your contribution! Please review https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before opening a pull request. --> <!-- Please add a reviewer to the assignee section when you create a PR. If you don't have the access to it, we will shortly find a reviewer and assign them to your PR. --> ## Why are these changes needed? This is part of ray-project#48133. Continuing the approach taken in ray-project#49426, this change makes the simple imputer work in append mode. ## Related issue number ray-project#48133 ## Checks - [x] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [x] I've included any doc changes needed for https://docs.ray.io/en/master/. - [x] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Unit tests - [ ] Release tests - [ ] This PR is not tested :( Signed-off-by: Martin Bomio <[email protected]>
1 parent 6dc5446 commit 5c32e64

File tree

2 files changed

+70
-10
lines changed

2 files changed

+70
-10
lines changed

python/ray/data/preprocessors/imputer.py

+29-5
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,17 @@ class SimpleImputer(Preprocessor):
6666
2 3.0 c
6767
3 3.0 c
6868
69+
:class:`SimpleImputer` can also be used in append mode by providing the
70+
names of the ``output_columns`` that should hold the imputed values.
71+
72+
>>> preprocessor = SimpleImputer(columns=["X"], output_columns=["X_imputed"], strategy="mean")
73+
>>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP
74+
X Y X_imputed
75+
0 0.0 None 0.0
76+
1 NaN b 2.0
77+
2 3.0 c 3.0
78+
3 3.0 c 3.0
79+
6980
Args:
7081
columns: The columns to apply imputation to.
7182
strategy: How imputed values are chosen.
@@ -75,6 +86,10 @@ class SimpleImputer(Preprocessor):
7586
* ``"constant"``: The value passed to ``fill_value``.
7687
7788
fill_value: The value to use when ``strategy`` is ``"constant"``.
89+
output_columns: The names of the transformed columns. If None, the transformed
90+
columns will be the same as the input columns. If not None, the length of
91+
``output_columns`` must match the length of ``columns``, otherwise an error
92+
will be raised.
7893
7994
Raises:
8095
ValueError: if ``strategy`` is not ``"mean"``, ``"most_frequent"``, or
@@ -88,6 +103,8 @@ def __init__(
88103
columns: List[str],
89104
strategy: str = "mean",
90105
fill_value: Optional[Union[str, Number]] = None,
106+
*,
107+
output_columns: Optional[List[str]] = None,
91108
):
92109
self.columns = columns
93110
self.strategy = strategy
@@ -107,6 +124,10 @@ def __init__(
107124
'`fill_value` must be set when using "constant" strategy.'
108125
)
109126

127+
self.output_columns = Preprocessor._derive_and_validate_output_columns(
128+
columns, output_columns
129+
)
130+
110131
def _fit(self, dataset: Dataset) -> Preprocessor:
111132
if self.strategy == "mean":
112133
aggregates = [Mean(col) for col in self.columns]
@@ -117,7 +138,7 @@ def _fit(self, dataset: Dataset) -> Preprocessor:
117138
return self
118139

119140
def _transform_pandas(self, df: pd.DataFrame):
120-
for column in self.columns:
141+
for column, output_column in zip(self.columns, self.output_columns):
121142
value = self._get_fill_value(column)
122143

123144
if value is None:
@@ -128,11 +149,13 @@ def _transform_pandas(self, df: pd.DataFrame):
128149

129150
if column not in df.columns:
130151
# Create the column with the fill_value if it doesn't exist
131-
df[column] = value
152+
df[output_column] = value
132153
else:
133154
if is_categorical_dtype(df.dtypes[column]):
134-
df[column] = df[column].cat.add_categories([value])
135-
df[column].fillna(value, inplace=True)
155+
df[output_column] = df[column].cat.add_categories([value])
156+
if output_column != column:
157+
df[output_column] = df[column].copy(deep=True)
158+
df[output_column].fillna(value, inplace=True)
136159

137160
return df
138161

@@ -152,7 +175,8 @@ def _get_fill_value(self, column):
152175
def __repr__(self):
153176
return (
154177
f"{self.__class__.__name__}(columns={self.columns!r}, "
155-
f"strategy={self.strategy!r}, fill_value={self.fill_value!r})"
178+
f"strategy={self.strategy!r}, fill_value={self.fill_value!r}, "
179+
f"output_columns={self.output_columns!r})"
156180
)
157181

158182

python/ray/data/tests/preprocessors/test_imputer.py

+41-5
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def test_simple_imputer():
3636
{"A": processed_col_a, "B": processed_col_b, "C": processed_col_c}
3737
)
3838

39-
assert out_df.equals(expected_df)
39+
pd.testing.assert_frame_equal(out_df, expected_df)
4040

4141
# Transform batch.
4242
pred_col_a = [1, 2, np.nan]
@@ -59,7 +59,7 @@ def test_simple_imputer():
5959
}
6060
)
6161

62-
assert pred_out_df.equals(pred_expected_df)
62+
pd.testing.assert_frame_equal(pred_out_df, pred_expected_df, check_like=True)
6363

6464
# with missing column
6565
pred_in_df = pd.DataFrame.from_dict({"A": pred_col_a, "B": pred_col_b})
@@ -71,7 +71,39 @@ def test_simple_imputer():
7171
"C": pred_processed_col_c,
7272
}
7373
)
74-
assert pred_out_df.equals(pred_expected_df)
74+
pd.testing.assert_frame_equal(pred_out_df, pred_expected_df, check_like=True)
75+
76+
# append mode
77+
with pytest.raises(ValueError):
78+
SimpleImputer(columns=["B", "C"], output_columns=["B_encoded"])
79+
80+
imputer = SimpleImputer(
81+
columns=["B", "C"],
82+
output_columns=["B_imputed", "C_imputed"],
83+
)
84+
imputer.fit(ds)
85+
86+
pred_col_a = [1, 2, np.nan]
87+
pred_col_b = [1, 2, np.nan]
88+
pred_col_c = [None, None, None]
89+
pred_in_df = pd.DataFrame.from_dict(
90+
{"A": pred_col_a, "B": pred_col_b, "C": pred_col_c}
91+
)
92+
pred_out_df = imputer.transform_batch(pred_in_df)
93+
94+
pred_processed_col_b = [1.0, 2.0, 2.0]
95+
pred_processed_col_c = [1.0, 1.0, 1.0]
96+
pred_expected_df = pd.DataFrame.from_dict(
97+
{
98+
"A": pred_col_a,
99+
"B": pred_col_b,
100+
"C": pred_col_c,
101+
"B_imputed": pred_processed_col_b,
102+
"C_imputed": pred_processed_col_c,
103+
}
104+
)
105+
106+
pd.testing.assert_frame_equal(pred_out_df, pred_expected_df, check_like=True)
75107

76108
# Test "most_frequent" strategy.
77109
most_frequent_col_a = [1, 2, 2, None, None, None]
@@ -97,7 +129,9 @@ def test_simple_imputer():
97129
{"A": most_frequent_processed_col_a, "B": most_frequent_processed_col_b}
98130
)
99131

100-
assert most_frequent_out_df.equals(most_frequent_expected_df)
132+
pd.testing.assert_frame_equal(
133+
most_frequent_out_df, most_frequent_expected_df, check_like=True
134+
)
101135

102136
# Test "constant" strategy.
103137
constant_col_a = ["apple", None]
@@ -123,7 +157,9 @@ def test_simple_imputer():
123157
)
124158
constant_expected_df["B"] = constant_expected_df["B"].astype("category")
125159

126-
assert constant_out_df.equals(constant_expected_df)
160+
pd.testing.assert_frame_equal(
161+
constant_out_df, constant_expected_df, check_like=True
162+
)
127163

128164

129165
def test_imputer_all_nan_raise_error():

0 commit comments

Comments
 (0)