From 0496516dfdbdcbed303fa8e99a8c31898a02661d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 18:51:48 -0500 Subject: [PATCH 01/10] Convert to numpy --- pandas/core/groupby/groupby.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 86171944d0c78..0cef2364845de 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -42,6 +42,7 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, + is_extension_array_dtype, is_datetime64_dtype, is_integer_dtype, is_numeric_dtype, @@ -2241,6 +2242,8 @@ def _get_cythonized_result( for idx, obj in enumerate(self._iterate_slices()): name = obj.name values = obj._values + if is_extension_array_dtype(values.dtype) and is_integer_dtype(values.dtype): + values = values.to_numpy(dtype=cython_dtype, na_value=np.nan) if aggregate: result_sz = ngroups From 5d7a4a5e8247da2c7d22d482680ce024fbcc86f2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 19:39:21 -0500 Subject: [PATCH 02/10] Test --- pandas/tests/groupby/test_apply.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 18ad5d90b3f60..49d466171e12f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -710,6 +710,30 @@ def test_apply_with_mixed_types(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("q", [0.5, [0.25, 0.5, 0.75]]) +def test_groupby_quantile_nullable_integer(q): + # https://github.com/pandas-dev/pandas/issues/33136 + values = np.arange(100, dtype=float) + values[::3] = np.nan + na_mask = np.isnan(values) + + df = pd.DataFrame( + {"a": ["x"] * 100 + ["y"] * 100, "b": pd.array(list(values) * 2, dtype="Int64")} + ) + grouped = df.groupby("a")["b"] + result = grouped.quantile(q) + + if isinstance(q, list): + idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) + np_result = list(np.quantile(values[~na_mask], q=q)) + else: + idx = pd.Index(["x", "y"], name="a") + np_result = [np.quantile(values[~na_mask], q=q)] + + expected = pd.Series(np_result * 2, index=idx, name="b") + tm.assert_series_equal(result, expected) + + def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) From 0e1a1a92a9b7be60ef42c69c4a28c4950e10cd16 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 19:40:52 -0500 Subject: [PATCH 03/10] Note --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 58ac2b4cba3b7..106fd89055246 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -421,7 +421,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) - +- Bug in :meth:`SeriesGroupBy.quantile` raising on nullable integers (:issue:`33136`) Reshaping ^^^^^^^^^ From 409dbf49fa11c8cd06d38036319f2771e1de76e1 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 19:45:31 -0500 Subject: [PATCH 04/10] Format --- pandas/core/groupby/groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0cef2364845de..ba4f1474693cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -42,8 +42,8 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, - is_extension_array_dtype, is_datetime64_dtype, + is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -2242,7 +2242,9 @@ def _get_cythonized_result( for idx, obj in enumerate(self._iterate_slices()): name = obj.name values = obj._values - if is_extension_array_dtype(values.dtype) and is_integer_dtype(values.dtype): + if is_extension_array_dtype(values.dtype) and is_integer_dtype( + values.dtype + ): values = values.to_numpy(dtype=cython_dtype, na_value=np.nan) if aggregate: From c2f12c917f7842ab8b162ed255732ec48f3356c6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 20:23:09 -0500 Subject: [PATCH 05/10] Hard code --- pandas/tests/groupby/test_apply.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 49d466171e12f..a1ecbb4cd459c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -725,12 +725,12 @@ def test_groupby_quantile_nullable_integer(q): if isinstance(q, list): idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) - np_result = list(np.quantile(values[~na_mask], q=q)) + true_quantiles = [25.25, 49.5, 73.75] else: idx = pd.Index(["x", "y"], name="a") - np_result = [np.quantile(values[~na_mask], q=q)] + true_quantiles = [49.5] - expected = pd.Series(np_result * 2, index=idx, name="b") + expected = pd.Series(true_quantiles * 2, index=idx, name="b") tm.assert_series_equal(result, expected) From 065ce081c31a1d4912b1e704e4f6419d401f1bb6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 20:24:52 -0500 Subject: [PATCH 06/10] Lint --- pandas/tests/groupby/test_apply.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a1ecbb4cd459c..5a5d4059c6047 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -715,7 +715,6 @@ def test_groupby_quantile_nullable_integer(q): # https://github.com/pandas-dev/pandas/issues/33136 values = np.arange(100, dtype=float) values[::3] = np.nan - na_mask = np.isnan(values) df = pd.DataFrame( {"a": ["x"] * 100 + ["y"] * 100, "b": pd.array(list(values) * 2, dtype="Int64")} From 4a8ed587696da252df13001619328d699d8a8ef4 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 20:28:14 -0500 Subject: [PATCH 07/10] Nit --- pandas/tests/groupby/test_apply.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a5d4059c6047..84bcd2e69b6e8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -719,8 +719,7 @@ def test_groupby_quantile_nullable_integer(q): df = pd.DataFrame( {"a": ["x"] * 100 + ["y"] * 100, "b": pd.array(list(values) * 2, dtype="Int64")} ) - grouped = df.groupby("a")["b"] - result = grouped.quantile(q) + result = df.groupby("a")["b"].quantile(q) if isinstance(q, list): idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) From 4a88dabd3e832a4c60bf478b8bfc47c39508a80f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 30 Mar 2020 12:11:37 -0500 Subject: [PATCH 08/10] Move test --- pandas/tests/groupby/test_apply.py | 22 ---------------------- pandas/tests/groupby/test_function.py | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 84bcd2e69b6e8..18ad5d90b3f60 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -710,28 +710,6 @@ def test_apply_with_mixed_types(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("q", [0.5, [0.25, 0.5, 0.75]]) -def test_groupby_quantile_nullable_integer(q): - # https://github.com/pandas-dev/pandas/issues/33136 - values = np.arange(100, dtype=float) - values[::3] = np.nan - - df = pd.DataFrame( - {"a": ["x"] * 100 + ["y"] * 100, "b": pd.array(list(values) * 2, dtype="Int64")} - ) - result = df.groupby("a")["b"].quantile(q) - - if isinstance(q, list): - idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) - true_quantiles = [25.25, 49.5, 73.75] - else: - idx = pd.Index(["x", "y"], name="a") - true_quantiles = [49.5] - - expected = pd.Series(true_quantiles * 2, index=idx, name="b") - tm.assert_series_equal(result, expected) - - def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 9c33843cdcecc..b93fea01177e3 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1519,6 +1519,28 @@ def test_quantile_missing_group_values_correct_results(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("q", [0.5, [0.25, 0.5, 0.75]]) +def test_groupby_quantile_nullable_integer(q): + # https://github.com/pandas-dev/pandas/issues/33136 + values = np.arange(100, dtype=float) + values[::3] = np.nan + + df = pd.DataFrame( + {"a": ["x"] * 100 + ["y"] * 100, "b": pd.array(list(values) * 2, dtype="Int64")} + ) + result = df.groupby("a")["b"].quantile(q) + + if isinstance(q, list): + idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) + true_quantiles = [25.25, 49.5, 73.75] + else: + idx = pd.Index(["x", "y"], name="a") + true_quantiles = [49.5] + + expected = pd.Series(true_quantiles * 2, index=idx, name="b") + tm.assert_series_equal(result, expected) + + # pipe # -------------------------------- From fcc00cba1e0d706f248d6851d10816d4bcaf2019 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 30 Mar 2020 19:27:52 -0500 Subject: [PATCH 09/10] Move check --- pandas/core/groupby/groupby.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 50b04d0926792..89d1df934655b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -42,6 +42,7 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, + is_bool_dtype, is_datetime64_dtype, is_extension_array_dtype, is_integer_dtype, @@ -1862,9 +1863,13 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: ) inference = None - if is_integer_dtype(vals): + if is_integer_dtype(vals.dtype): + if is_extension_array_dtype(vals.dtype): + vals = vals.to_numpy(dtype=float, na_value=np.nan) inference = np.int64 - elif is_datetime64_dtype(vals): + elif is_bool_dtype(vals.dtype) and is_extension_array_dtype(vals.dtype): + vals = vals.to_numpy(dtype=float, na_value=np.nan) + elif is_datetime64_dtype(vals.dtype): inference = "datetime64[ns]" vals = np.asarray(vals).astype(np.float) @@ -2242,10 +2247,6 @@ def _get_cythonized_result( for idx, obj in enumerate(self._iterate_slices()): name = obj.name values = obj._values - if is_extension_array_dtype(values.dtype) and is_integer_dtype( - values.dtype - ): - values = values.to_numpy(dtype=cython_dtype, na_value=np.nan) if aggregate: result_sz = ngroups From bb69e3d574c47bf15cdedb37e1e76e7790f9fc87 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 30 Mar 2020 19:28:28 -0500 Subject: [PATCH 10/10] Update test --- pandas/tests/groupby/test_function.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b93fea01177e3..346de55f551df 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1519,23 +1519,25 @@ def test_quantile_missing_group_values_correct_results(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("q", [0.5, [0.25, 0.5, 0.75]]) -def test_groupby_quantile_nullable_integer(q): +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 0, None] * 2, dtype="Int64"), + pd.array([True, False, None] * 2, dtype="boolean"), + ], +) +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +def test_groupby_quantile_nullable_array(values, q): # https://github.com/pandas-dev/pandas/issues/33136 - values = np.arange(100, dtype=float) - values[::3] = np.nan - - df = pd.DataFrame( - {"a": ["x"] * 100 + ["y"] * 100, "b": pd.array(list(values) * 2, dtype="Int64")} - ) + df = pd.DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) result = df.groupby("a")["b"].quantile(q) if isinstance(q, list): idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) - true_quantiles = [25.25, 49.5, 73.75] + true_quantiles = [0.0, 0.5, 1.0] else: idx = pd.Index(["x", "y"], name="a") - true_quantiles = [49.5] + true_quantiles = [0.5] expected = pd.Series(true_quantiles * 2, index=idx, name="b") tm.assert_series_equal(result, expected)