From a52e7fed90b025e7826bc44d4f22a0262edceed0 Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 17 Apr 2024 17:01:13 +0800 Subject: [PATCH 01/16] Temporarily change observed=True, for groupby.transform --- pandas/core/groupby/generic.py | 3 ++ pandas/core/groupby/groupby.py | 80 +++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0a048d11d0b4d..23b785f282ca1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2044,8 +2044,11 @@ def _gotitem(self, key, ndim: int, subset=None): elif ndim == 1: if subset is None: subset = self.obj[key] + + orig_obj = self.orig_obj if not self.observed else None return SeriesGroupBy( subset, + orig_obj, self.keys, level=self.level, grouper=self._grouper, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bc37405b25a16..ec6c92792dd2e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1096,6 +1096,7 @@ class GroupBy(BaseGroupBy[NDFrameT]): def __init__( self, obj: NDFrameT, + orig_obj: NDFrameT | None = None, keys: _KeysArgType | None = None, level: IndexLabel | None = None, grouper: ops.BaseGrouper | None = None, @@ -1117,6 +1118,7 @@ def __init__( self.sort = sort self.group_keys = group_keys self.dropna = dropna + self.orig_obj = obj if orig_obj is None else orig_obj if grouper is None: grouper, exclusions, obj = get_grouper( @@ -1879,24 +1881,70 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): else: # i.e. func in base.reduction_kernels + if self.observed: + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - # GH#30918 Use _transform_fast only when we know func is an aggregation - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. 
- with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result - if func in ["idxmin", "idxmax"]: - func = cast(Literal["idxmin", "idxmax"], func) - result = self._idxmax_idxmin(func, True, *args, **kwargs) - else: - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + grouper, exclusions, obj = get_grouper( + self.orig_obj, + self.keys, + level=self.level, + sort=self.sort, + observed=True, + dropna=self.dropna, + ) + exclusions = frozenset(exclusions) if exclusions else frozenset() + obj_has_not_changed = self.orig_obj.equals(self.obj) + + with ( + com.temp_setattr(self, "observed", True), + com.temp_setattr(self, "_grouper", grouper), + com.temp_setattr(self, "exclusions", exclusions), + com.temp_setattr(self, "obj", obj, condition=obj_has_not_changed), + ): + return self._reduction_kernel_transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + # with com.temp_setattr(self, "as_index", True): + # # GH#49834 - result needs groups in the index for + # # _wrap_transform_fast_result + # if func in ["idxmin", "idxmax"]: + # func = cast(Literal["idxmin", "idxmax"], func) + # result = self._idxmax_idxmin(func, True, *args, **kwargs) + # else: + # if engine is not None: + # kwargs["engine"] = engine + # kwargs["engine_kwargs"] = engine_kwargs + # result = getattr(self, func)(*args, **kwargs) + + # print("result with observed = False\n", result.to_string()) + # r = self._wrap_transform_fast_result(result) + # print("reindexed result", r.to_string()) + # return r + + @final + def _reduction_kernel_transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. 
Compute func result + # and deal with possible broadcasting below. + with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) - return self._wrap_transform_fast_result(result) + return self._wrap_transform_fast_result(result) @final def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: From 898fd12fd76aa26cac8ddb9c51511b61a514a13d Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 17 Apr 2024 17:01:48 +0800 Subject: [PATCH 02/16] Add tests --- .../tests/groupby/transform/test_transform.py | 154 +++++++++++++++++- 1 file changed, 151 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 245fb9c7babd7..6af044d0b0c5a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1232,9 +1232,9 @@ def test_categorical_and_not_categorical_key(observed): tm.assert_frame_equal(result, expected_explicit) # Series case - result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( - "sum" - ) + gb = df_with_categorical.groupby(["A", "C"], observed=observed) + gbp = gb["B"] + result = gbp.transform("sum") expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") tm.assert_series_equal(result, expected) expected_explicit = Series([4, 2, 4], name="B") @@ -1535,3 +1535,151 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): result = df.groupby(series, as_index=False).transform("sum") expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]}) tm.assert_frame_equal(result, expected) + + +def 
test_min_one_unobserved_category_no_type_coercion(): + df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) + df["B"] = df["B"].astype("int32") + gb = df.groupby("A", observed=False) + result = gb.transform("min") + + expected = DataFrame({"B": [3, 3, 5]}, dtype="int32") + tm.assert_frame_equal(expected, result) + assert df["B"].dtype == result["B"].dtype + + +def test_min_multiple_unobserved_categories_no_type_coercion(): + df = DataFrame( + { + "X": Categorical( + ["432945", "randomcat", -4325466, "randomcat", -4325466, -4325466], + categories=[ + 1, + "randomcat", + 100, + 333, + "cat43543", + -4325466, + 54665, + -546767, + "432945", + 767076, + ], + ), + "Y": [0, 940645, np.iinfo(np.int64).min, 9449, 100044444, 40], + } + ) + df["Y"] = df["Y"].astype("int64") + + gb = df.groupby("X", observed=False) + result = gb.transform("min") + + expected = DataFrame( + { + "Y": [ + 0, + 9449, + np.iinfo(np.int64).min, + 9449, + np.iinfo(np.int64).min, + np.iinfo(np.int64).min, + ] + }, + dtype="int64", + ) + tm.assert_frame_equal(expected, result) + assert df["Y"].dtype == result["Y"].dtype + + +def test_min_float32_multiple_unobserved_categories_no_type_coercion(): + df = DataFrame( + { + "X": Categorical( + ["cat43543", -4325466, 54665, "cat43543", -4325466, 54665], + categories=[ + 1, + "randomcat", + 100, + 333, + "cat43543", + -4325466, + 54665, + -546767, + "432945", + 767076, + ], + ), + "Y": [ + 0.3940429, + 940645.49, + np.finfo(np.float32).min, + 9449.03333, + 100044444.403294, + 40.3020909, + ], + } + ) + df["Y"] = df["Y"].astype("float32") + + gb = df.groupby("X", observed=False) + result = gb.transform("min") + + expected = DataFrame( + { + "Y": [ + 0.3940429, + 940645.49, + np.finfo(np.float32).min, + 0.3940429, + 940645.49, + np.finfo(np.float32).min, + ] + }, + dtype="float32", + ) + tm.assert_frame_equal(expected, result) + assert df["Y"].dtype == result["Y"].dtype + + +def test_min_all_empty_data_no_type_coercion(): + df = 
DataFrame( + { + "X": Categorical( + [], + categories=[ + 1, + "randomcat", + 100, + 333, + "cat43543", + -4325466, + 54665, + -546767, + "432945", + 767076, + ], + ), + "Y": [], + } + ) + df["Y"] = df["Y"].astype("int32") + + gb = df.groupby("X", observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": []}, dtype="int32") + tm.assert_frame_equal(expected, result) + assert df["Y"].dtype == result["Y"].dtype + + +def test_min_one_dim_no_type_coercion(): + df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) + df["Y"] = df["Y"].astype("int32") + categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5]) + + gb = df.groupby(categories, observed=False) + result = gb.transform("min") + + expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") + tm.assert_frame_equal(expected, result) + assert df["Y"].dtype == result["Y"].dtype From 5311004474b598c8d46d13dfb17b62d14ec0b2d3 Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 17 Apr 2024 22:42:13 +0800 Subject: [PATCH 03/16] Add orig_obj in BaseGroupBy hidden attr --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ec6c92792dd2e..c41dfd818b255 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -586,6 +586,7 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): "keys", "level", "obj", + "orig_obj", "observed", "sort", } From fb548ade3fc34cc45ec70392a18ae50a954bd8e4 Mon Sep 17 00:00:00 2001 From: Kei Date: Wed, 17 Apr 2024 22:50:38 +0800 Subject: [PATCH 04/16] Update tests according to pr comments --- pandas/tests/groupby/transform/test_transform.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6af044d0b0c5a..81fdd49492a66 100644 --- 
a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1537,6 +1537,7 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): tm.assert_frame_equal(result, expected) +# GH#58084 def test_min_one_unobserved_category_no_type_coercion(): df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) df["B"] = df["B"].astype("int32") @@ -1545,9 +1546,9 @@ def test_min_one_unobserved_category_no_type_coercion(): expected = DataFrame({"B": [3, 3, 5]}, dtype="int32") tm.assert_frame_equal(expected, result) - assert df["B"].dtype == result["B"].dtype +# GH#58084 def test_min_multiple_unobserved_categories_no_type_coercion(): df = DataFrame( { @@ -1588,9 +1589,9 @@ def test_min_multiple_unobserved_categories_no_type_coercion(): dtype="int64", ) tm.assert_frame_equal(expected, result) - assert df["Y"].dtype == result["Y"].dtype +# GH#58084 def test_min_float32_multiple_unobserved_categories_no_type_coercion(): df = DataFrame( { @@ -1638,9 +1639,9 @@ def test_min_float32_multiple_unobserved_categories_no_type_coercion(): dtype="float32", ) tm.assert_frame_equal(expected, result) - assert df["Y"].dtype == result["Y"].dtype +# GH#58084 def test_min_all_empty_data_no_type_coercion(): df = DataFrame( { @@ -1669,9 +1670,9 @@ def test_min_all_empty_data_no_type_coercion(): expected = DataFrame({"Y": []}, dtype="int32") tm.assert_frame_equal(expected, result) - assert df["Y"].dtype == result["Y"].dtype +# GH#58084 def test_min_one_dim_no_type_coercion(): df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) df["Y"] = df["Y"].astype("int32") @@ -1682,4 +1683,3 @@ def test_min_one_dim_no_type_coercion(): expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") tm.assert_frame_equal(expected, result) - assert df["Y"].dtype == result["Y"].dtype From baa1b284b9e64afc60ea711982d31fdbf6b69ca6 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 18 Apr 2024 00:04:59 +0800 
Subject: [PATCH 05/16] Move orig_obj arg in constructor to last param, to account for possible empty param --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 23b785f282ca1..bbfc35b8e30b7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2048,7 +2048,6 @@ def _gotitem(self, key, ndim: int, subset=None): orig_obj = self.orig_obj if not self.observed else None return SeriesGroupBy( subset, - orig_obj, self.keys, level=self.level, grouper=self._grouper, @@ -2059,6 +2058,7 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, + orig_obj=orig_obj, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c41dfd818b255..7661902a65579 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1097,7 +1097,6 @@ class GroupBy(BaseGroupBy[NDFrameT]): def __init__( self, obj: NDFrameT, - orig_obj: NDFrameT | None = None, keys: _KeysArgType | None = None, level: IndexLabel | None = None, grouper: ops.BaseGrouper | None = None, @@ -1108,6 +1107,7 @@ def __init__( group_keys: bool = True, observed: bool = False, dropna: bool = True, + orig_obj: NDFrameT | None = None, ) -> None: self._selection = selection From 30013ee067e7ebc03ce4e0fe6c0d44c44205f3e4 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 16:38:15 +0800 Subject: [PATCH 06/16] Move calculation of observed grouper to when initialising groupby --- pandas/core/groupby/generic.py | 6 ++-- pandas/core/groupby/groupby.py | 56 +++++++++++++--------------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bbfc35b8e30b7..d298139d72554 100644 --- 
a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2040,12 +2040,13 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, + observed_grouper=self.observed_grouper, + observed_exclusions=self.observed_exclusions, ) elif ndim == 1: if subset is None: subset = self.obj[key] - orig_obj = self.orig_obj if not self.observed else None return SeriesGroupBy( subset, self.keys, @@ -2058,7 +2059,8 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, - orig_obj=orig_obj, + observed_grouper=self.observed_grouper, + observed_exclusions=self.observed_exclusions, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7661902a65579..c34c3a3861255 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -586,9 +586,10 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): "keys", "level", "obj", - "orig_obj", "observed", "sort", + "observed_grouper", + "observed_exclusions", } _grouper: ops.BaseGrouper @@ -1107,7 +1108,8 @@ def __init__( group_keys: bool = True, observed: bool = False, dropna: bool = True, - orig_obj: NDFrameT | None = None, + observed_grouper: ops.BaseGrouper | None = None, + observed_exclusions: frozenset[Hashable] | None = None, ) -> None: self._selection = selection @@ -1119,8 +1121,8 @@ def __init__( self.sort = sort self.group_keys = group_keys self.dropna = dropna - self.orig_obj = obj if orig_obj is None else orig_obj + orig_obj = obj if grouper is None: grouper, exclusions, obj = get_grouper( obj, @@ -1136,6 +1138,21 @@ def __init__( self._grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() + if not observed and observed_grouper is None: + observed_grouper, observed_exclusions, _ = get_grouper( + orig_obj, + self.keys, + 
level=self.level, + sort=self.sort, + observed=True, + dropna=self.dropna, + ) + + self.observed_grouper = observed_grouper + self.observed_exclusions = ( + frozenset(observed_exclusions) if observed_exclusions else frozenset() + ) + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) @@ -1887,44 +1904,15 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - grouper, exclusions, obj = get_grouper( - self.orig_obj, - self.keys, - level=self.level, - sort=self.sort, - observed=True, - dropna=self.dropna, - ) - exclusions = frozenset(exclusions) if exclusions else frozenset() - obj_has_not_changed = self.orig_obj.equals(self.obj) - with ( com.temp_setattr(self, "observed", True), - com.temp_setattr(self, "_grouper", grouper), - com.temp_setattr(self, "exclusions", exclusions), - com.temp_setattr(self, "obj", obj, condition=obj_has_not_changed), + com.temp_setattr(self, "_grouper", self.observed_grouper), + com.temp_setattr(self, "exclusions", self.observed_exclusions), ): return self._reduction_kernel_transform( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - # with com.temp_setattr(self, "as_index", True): - # # GH#49834 - result needs groups in the index for - # # _wrap_transform_fast_result - # if func in ["idxmin", "idxmax"]: - # func = cast(Literal["idxmin", "idxmax"], func) - # result = self._idxmax_idxmin(func, True, *args, **kwargs) - # else: - # if engine is not None: - # kwargs["engine"] = engine - # kwargs["engine_kwargs"] = engine_kwargs - # result = getattr(self, func)(*args, **kwargs) - - # print("result with observed = False\n", result.to_string()) - # r = self._wrap_transform_fast_result(result) - # print("reindexed result", r.to_string()) - # return r - @final def _reduction_kernel_transform( self, func, *args, engine=None, engine_kwargs=None, **kwargs From 
3b9d27b631ee5a5b7ad045030bf38193d3e9465c Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 17:47:53 +0800 Subject: [PATCH 07/16] Only calculate observed_grouper when grouper is absent to account to edge agg cases --- pandas/core/groupby/groupby.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c34c3a3861255..9adb6d16502ee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1122,7 +1122,21 @@ def __init__( self.group_keys = group_keys self.dropna = dropna - orig_obj = obj + if not observed and grouper is None: + observed_grouper, observed_exclusions, _ = get_grouper( + obj, + self.keys, + level=self.level, + sort=self.sort, + observed=True, + dropna=self.dropna, + ) + + self.observed_grouper = observed_grouper + self.observed_exclusions = ( + frozenset(observed_exclusions) if observed_exclusions else frozenset() + ) + if grouper is None: grouper, exclusions, obj = get_grouper( obj, @@ -1138,21 +1152,6 @@ def __init__( self._grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() - if not observed and observed_grouper is None: - observed_grouper, observed_exclusions, _ = get_grouper( - orig_obj, - self.keys, - level=self.level, - sort=self.sort, - observed=True, - dropna=self.dropna, - ) - - self.observed_grouper = observed_grouper - self.observed_exclusions = ( - frozenset(observed_exclusions) if observed_exclusions else frozenset() - ) - def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) From 8588a1e522c2c7163c7dec2f2af88d8a0f486487 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 19:59:52 +0800 Subject: [PATCH 08/16] Remove observed exclusions --- pandas/core/groupby/generic.py | 2 -- pandas/core/groupby/groupby.py | 8 +------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git 
a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d298139d72554..09ad065d8a6ac 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2041,7 +2041,6 @@ def _gotitem(self, key, ndim: int, subset=None): observed=self.observed, dropna=self.dropna, observed_grouper=self.observed_grouper, - observed_exclusions=self.observed_exclusions, ) elif ndim == 1: if subset is None: @@ -2060,7 +2059,6 @@ def _gotitem(self, key, ndim: int, subset=None): observed=self.observed, dropna=self.dropna, observed_grouper=self.observed_grouper, - observed_exclusions=self.observed_exclusions, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7f675e45eab2e..391b5b200deed 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -589,7 +589,6 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): "observed", "sort", "observed_grouper", - "observed_exclusions", } _grouper: ops.BaseGrouper @@ -1109,7 +1108,6 @@ def __init__( observed: bool = False, dropna: bool = True, observed_grouper: ops.BaseGrouper | None = None, - observed_exclusions: frozenset[Hashable] | None = None, ) -> None: self._selection = selection @@ -1123,7 +1121,7 @@ def __init__( self.dropna = dropna if not observed and grouper is None: - observed_grouper, observed_exclusions, _ = get_grouper( + observed_grouper, _, _ = get_grouper( obj, self.keys, level=self.level, @@ -1133,9 +1131,6 @@ def __init__( ) self.observed_grouper = observed_grouper - self.observed_exclusions = ( - frozenset(observed_exclusions) if observed_exclusions else frozenset() - ) if grouper is None: grouper, exclusions, obj = get_grouper( @@ -1905,7 +1900,6 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): with ( com.temp_setattr(self, "observed", True), com.temp_setattr(self, "_grouper", self.observed_grouper), - com.temp_setattr(self, 
"exclusions", self.observed_exclusions), ): return self._reduction_kernel_transform( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs From 8e669d9e6c04f83d79141abf7f03e320b49a0246 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 20:44:04 +0800 Subject: [PATCH 09/16] Add observed grouper/grouping as cached method --- pandas/core/groupby/generic.py | 2 -- pandas/core/groupby/groupby.py | 16 +--------------- pandas/core/groupby/grouper.py | 18 ++++++++++++++++++ pandas/core/groupby/ops.py | 13 +++++++++++++ 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 09ad065d8a6ac..75f04a5a303a8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2040,7 +2040,6 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, - observed_grouper=self.observed_grouper, ) elif ndim == 1: if subset is None: @@ -2058,7 +2057,6 @@ def _gotitem(self, key, ndim: int, subset=None): group_keys=self.group_keys, observed=self.observed, dropna=self.dropna, - observed_grouper=self.observed_grouper, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 391b5b200deed..75bf03fbeac75 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -588,7 +588,6 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): "obj", "observed", "sort", - "observed_grouper", } _grouper: ops.BaseGrouper @@ -1107,7 +1106,6 @@ def __init__( group_keys: bool = True, observed: bool = False, dropna: bool = True, - observed_grouper: ops.BaseGrouper | None = None, ) -> None: self._selection = selection @@ -1120,18 +1118,6 @@ def __init__( self.group_keys = group_keys self.dropna = dropna - if not observed and grouper is None: - observed_grouper, _, _ = get_grouper( - obj, - self.keys, - 
level=self.level, - sort=self.sort, - observed=True, - dropna=self.dropna, - ) - - self.observed_grouper = observed_grouper - if grouper is None: grouper, exclusions, obj = get_grouper( obj, @@ -1899,7 +1885,7 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): with ( com.temp_setattr(self, "observed", True), - com.temp_setattr(self, "_grouper", self.observed_grouper), + com.temp_setattr(self, "_grouper", self._grouper.observed_grouper), ): return self._reduction_kernel_transform( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2d10bd5d00eb2..615785c363072 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -668,6 +668,24 @@ def groups(self) -> dict[Hashable, Index]: cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) + @cache_readonly + def observed_grouping(self) -> Grouper: + if self._observed: + return self + + grouping = Grouping( + self._index, + self._orig_grouper, + obj=self.obj, + level=self.level, + sort=self._sort, + observed=True, + in_axis=self.in_axis, + dropna=self._dropna, + uniques=self._uniques, + ) + return grouping + def get_grouper( obj: NDFrameT, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index effa94b1606bd..e5c542f35d050 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -823,6 +823,15 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: return result_index, ids + @cache_readonly + def observed_grouper(self) -> BaseGrouper: + if all(ping._observed for ping in self.groupings): + return self + + groupings = [ping.observed_grouping for ping in self.groupings] + grouper = BaseGrouper(self.axis, groupings, sort=self._sort, dropna=self.dropna) + return grouper + def _ob_index_and_ids( self, levels: list[Index], @@ -1154,6 +1163,10 @@ def groupings(self) -> 
list[grouper.Grouping]: ) return [ping] + @cache_readonly + def observed_grouper(self) -> BinGrouper: + return self + def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): From 0d9f89de4b438204778585c22706ae8015a865a3 Mon Sep 17 00:00:00 2001 From: Kei Date: Sat, 20 Apr 2024 21:40:21 +0800 Subject: [PATCH 10/16] change return type to grouping --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 615785c363072..75aae104764c9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -669,7 +669,7 @@ def groups(self) -> dict[Hashable, Index]: return self._index.groupby(cats) @cache_readonly - def observed_grouping(self) -> Grouper: + def observed_grouping(self) -> Grouping: if self._observed: return self From 84f83aefca48082117c0392aa9578dd317beded8 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 25 Apr 2024 20:32:15 +0800 Subject: [PATCH 11/16] Update rst docs --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f291820bc5266..1ca8c82404074 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -429,7 +429,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. 
(:issue:`57775`) - +- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` coerces dtype (:issue:`55326`) Reshaping ^^^^^^^^^ From cbabce0d88a3bbd8d574ed4631e9279030704273 Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 25 Apr 2024 20:39:28 +0800 Subject: [PATCH 12/16] Update rst docs --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1ca8c82404074..2b9ddc7990d54 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -429,7 +429,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) -- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` coerces dtype (:issue:`55326`) +- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories.
(:issue:`55326`) Reshaping ^^^^^^^^^ From bcca14fcbfff74b0e94657438c5b3f0157b27e3c Mon Sep 17 00:00:00 2001 From: Kei Date: Mon, 29 Apr 2024 15:25:18 +0800 Subject: [PATCH 13/16] Cache observed grouping/grouper instead of self obj --- pandas/core/groupby/grouper.py | 6 +++++- pandas/core/groupby/ops.py | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 75aae104764c9..e75a5b9089f5f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -668,11 +668,15 @@ def groups(self) -> dict[Hashable, Index]: cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) - @cache_readonly + @property def observed_grouping(self) -> Grouping: if self._observed: return self + return self._observed_grouping + + @cache_readonly + def _observed_grouping(self) -> Grouping: grouping = Grouping( self._index, self._orig_grouper, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e5c542f35d050..4f40c4f4283f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -823,11 +823,15 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: return result_index, ids - @cache_readonly + @property def observed_grouper(self) -> BaseGrouper: if all(ping._observed for ping in self.groupings): return self + return self._observed_grouper + + @cache_readonly + def _observed_grouper(self) -> BaseGrouper: groupings = [ping.observed_grouping for ping in self.groupings] grouper = BaseGrouper(self.axis, groupings, sort=self._sort, dropna=self.dropna) return grouper @@ -1163,7 +1167,7 @@ def groupings(self) -> list[grouper.Grouping]: ) return [ping] - @cache_readonly + @property def observed_grouper(self) -> BinGrouper: return self From f3a3f639051de04bd3ab3df9d09253923cde923d Mon Sep 17 00:00:00 2001 From: Kei Date: Thu, 2 May 2024 11:12:56 +0800 Subject: [PATCH 14/16] Update according to pr 
comments --- pandas/core/generic.py | 1 - .../tests/groupby/transform/test_transform.py | 119 ++---------------- 2 files changed, 8 insertions(+), 112 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dbe2006642484..4b0d64bed9b38 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2044,7 +2044,6 @@ def __setstate__(self, state) -> None: object.__setattr__(self, "_attrs", attrs) flags = state.get("_flags", {"allows_duplicate_labels": True}) object.__setattr__(self, "_flags", Flags(self, **flags)) - # set in the order of internal names # to avoid definitional recursion # e.g. say fill_value needing _mgr to be diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 81fdd49492a66..d6d545a8c4834 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1537,128 +1537,25 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): tm.assert_frame_equal(result, expected) -# GH#58084 -def test_min_one_unobserved_category_no_type_coercion(): +@pytest.mark.parametrize("dtype", ["int32", "float32"]) +def test_min_one_unobserved_category_no_type_coercion(dtype): + # GH#58084 df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) - df["B"] = df["B"].astype("int32") + df["B"] = df["B"].astype(dtype) gb = df.groupby("A", observed=False) result = gb.transform("min") - expected = DataFrame({"B": [3, 3, 5]}, dtype="int32") + expected = DataFrame({"B": [3, 3, 5]}, dtype=dtype) tm.assert_frame_equal(expected, result) -# GH#58084 -def test_min_multiple_unobserved_categories_no_type_coercion(): - df = DataFrame( - { - "X": Categorical( - ["432945", "randomcat", -4325466, "randomcat", -4325466, -4325466], - categories=[ - 1, - "randomcat", - 100, - 333, - "cat43543", - -4325466, - 54665, - -546767, - "432945", - 767076, - ], - ), - "Y": [0, 940645, 
np.iinfo(np.int64).min, 9449, 100044444, 40], - } - ) - df["Y"] = df["Y"].astype("int64") - - gb = df.groupby("X", observed=False) - result = gb.transform("min") - - expected = DataFrame( - { - "Y": [ - 0, - 9449, - np.iinfo(np.int64).min, - 9449, - np.iinfo(np.int64).min, - np.iinfo(np.int64).min, - ] - }, - dtype="int64", - ) - tm.assert_frame_equal(expected, result) - - -# GH#58084 -def test_min_float32_multiple_unobserved_categories_no_type_coercion(): - df = DataFrame( - { - "X": Categorical( - ["cat43543", -4325466, 54665, "cat43543", -4325466, 54665], - categories=[ - 1, - "randomcat", - 100, - 333, - "cat43543", - -4325466, - 54665, - -546767, - "432945", - 767076, - ], - ), - "Y": [ - 0.3940429, - 940645.49, - np.finfo(np.float32).min, - 9449.03333, - 100044444.403294, - 40.3020909, - ], - } - ) - df["Y"] = df["Y"].astype("float32") - - gb = df.groupby("X", observed=False) - result = gb.transform("min") - - expected = DataFrame( - { - "Y": [ - 0.3940429, - 940645.49, - np.finfo(np.float32).min, - 0.3940429, - 940645.49, - np.finfo(np.float32).min, - ] - }, - dtype="float32", - ) - tm.assert_frame_equal(expected, result) - - -# GH#58084 def test_min_all_empty_data_no_type_coercion(): + # GH#58084 df = DataFrame( { "X": Categorical( [], - categories=[ - 1, - "randomcat", - 100, - 333, - "cat43543", - -4325466, - 54665, - -546767, - "432945", - 767076, - ], + categories=[1, "randomcat", 100], ), "Y": [], } @@ -1672,8 +1569,8 @@ def test_min_all_empty_data_no_type_coercion(): tm.assert_frame_equal(expected, result) -# GH#58084 def test_min_one_dim_no_type_coercion(): + # GH#58084 df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) df["Y"] = df["Y"].astype("int32") categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5]) From 7f99b7114d88c77bf071aef2993d20b54188534a Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 7 May 2024 10:47:09 +0800 Subject: [PATCH 15/16] Revert unintentional changes --- pandas/core/generic.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 930bff5679148..24727bb9d83c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2055,6 +2055,7 @@ def __setstate__(self, state) -> None: object.__setattr__(self, "_attrs", attrs) flags = state.get("_flags", {"allows_duplicate_labels": True}) object.__setattr__(self, "_flags", Flags(self, **flags)) + # set in the order of internal names # to avoid definitional recursion # e.g. say fill_value needing _mgr to be From 43644408fc2b9868b4f7a55be072ebecbc965927 Mon Sep 17 00:00:00 2001 From: Kei Date: Tue, 7 May 2024 10:48:12 +0800 Subject: [PATCH 16/16] Revert unintentional changes --- pandas/core/groupby/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 75f04a5a303a8..0a048d11d0b4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2044,7 +2044,6 @@ def _gotitem(self, key, ndim: int, subset=None): elif ndim == 1: if subset is None: subset = self.obj[key] - return SeriesGroupBy( subset, self.keys,