From dac516843ea354312152ffacdc2da40865052527 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 20 Aug 2022 00:29:46 -0700 Subject: [PATCH] BUG: DataFrame.select_dtypes(include=number) still included BooleanDtype (#48169) --- doc/source/whatsnew/v1.4.4.rst | 1 + pandas/core/frame.py | 5 ++++- pandas/tests/frame/methods/test_select_dtypes.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 3414082581b80b..2f290217bf9854 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in taking NULL :class:`objects` from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`) - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) - Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`) +- Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`) - Fixed regression in :meth:`DataFrame.loc` not updating the cache correctly after values were set (:issue:`47867`) - Fixed regression in :meth:`DataFrame.loc` not aligning index in some cases when setting a :class:`DataFrame` (:issue:`47578`) - Fixed regression in :meth:`DataFrame.loc` setting a length-1 array like value to a single value in the DataFrame (:issue:`46268`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 04e168f1ab6cad..bcc7a2ae8f83f2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4687,8 +4687,11 @@ def check_int_infer_dtype(dtypes): raise ValueError(f"include and exclude overlap on {(include & exclude)}") def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: + # GH 46870: BooleanDtype._is_numeric == True but should be excluded return issubclass(dtype.type, tuple(dtypes_set)) or ( - np.number in dtypes_set and getattr(dtype, "_is_numeric", False) + np.number in dtypes_set + and getattr(dtype, "_is_numeric", False) + and not is_bool_dtype(dtype) ) def predicate(arr: ArrayLike) -> bool: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 9da6b61e676037..6ff5a41b67ec2c 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -441,3 +441,18 @@ def test_select_dtypes_float_dtype(self, expected, float_dtypes): df = df.astype(dtype_dict) result = df.select_dtypes(include=float_dtypes) tm.assert_frame_equal(result, expected) + + def test_np_bool_ea_boolean_include_number(self): + # GH 46870 + df = DataFrame( + { + "a": [1, 2, 3], + "b": pd.Series([True, False, True], dtype="boolean"), + "c": np.array([True, False, True]), + "d": pd.Categorical([True, False, True]), + "e": pd.arrays.SparseArray([True, False, True]), + } + ) + result = df.select_dtypes(include="number") + expected = DataFrame({"a": [1, 2, 3]}) + tm.assert_frame_equal(result, expected)