From 47c9ee7b32d8bde2ff8cf50288a8787b11d512cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Jan 2023 15:08:28 -0800 Subject: [PATCH] DEPR: Enforce certain DataFrame reductions w/ axis=None to return scalars (#50593) --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/frame.py | 10 ++-- pandas/core/generic.py | 51 ++++++++----------- pandas/tests/frame/test_reductions.py | 31 +++++------ pandas/tests/groupby/test_categorical.py | 8 +-- pandas/tests/groupby/test_function.py | 16 ++---- .../tests/groupby/transform/test_transform.py | 17 ++----- 7 files changed, 54 insertions(+), 80 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3b049f3059666..e44ae2cb40826 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -715,6 +715,7 @@ Removal of prior version deprecations/changes - Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`) - Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. 
(:issue:`36695`, :issue:`24804`) - Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`) +- Changed behavior of :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` with ``axis=None`` to return a scalar applying the aggregation across both axes (:issue:`45072`) - Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`) - Changed behavior of comparison of ``NaT`` with a ``datetime.date`` object; these now raise on inequality comparisons (:issue:`39196`) - Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fed92dc80a99b..6491081c54592 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10357,9 +10357,8 @@ def _reduce( assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None - # TODO: Make other agg func handle axis=None properly GH#21597 - axis = self._get_axis_number(axis) - assert axis in [0, 1] + if axis is not None: + axis = self._get_axis_number(axis) def func(values: np.ndarray): # We only use this in the case that operates on self.values @@ -10410,7 +10409,7 @@ def _get_data() -> DataFrame: return out - assert not numeric_only and axis == 1 + assert not numeric_only and axis in (1, None) data = self values = data.values @@ -10426,6 +10425,9 @@ def _get_data() -> DataFrame: # try to coerce to the original dtypes item by item if we can pass + if axis is None: + return result + 
labels = self._get_agg_axis(axis) result = self._constructor_sliced(result, index=labels) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3cade2568d921..561422c868e91 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10944,7 +10944,7 @@ def _stat_function( self, name: str, func, - axis: Axis | None | lib.NoDefault = None, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -10956,30 +10956,13 @@ def _stat_function( validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is None and self.ndim > 1: - # user must have explicitly passed axis=None - # GH#21597 - warnings.warn( - f"In a future version, DataFrame.{name}(axis=None) will return a " - f"scalar {name} over the entire DataFrame. To retain the old " - f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'", - FutureWarning, - stacklevel=find_stack_level(), - ) - - if axis is lib.no_default: - axis = None - - if axis is None: - axis = self._stat_axis_number - return self._reduce( func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only ) def min( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -10995,7 +10978,7 @@ def min( def max( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11011,7 +10994,7 @@ def max( def mean( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11022,7 +11005,7 @@ def mean( def median( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11033,7 +11016,7 @@ def median( def skew( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, 
skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11044,7 +11027,7 @@ def skew( def kurt( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11366,7 +11349,7 @@ def prod( ) def mean( self, - axis: AxisInt | None | lib.NoDefault = lib.no_default, + axis: AxisInt | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11387,7 +11370,7 @@ def mean( ) def skew( self, - axis: AxisInt | None | lib.NoDefault = lib.no_default, + axis: AxisInt | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11411,7 +11394,7 @@ def skew( ) def kurt( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11433,7 +11416,7 @@ def kurt( ) def median( self, - axis: AxisInt | None | lib.NoDefault = lib.no_default, + axis: AxisInt | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11456,7 +11439,7 @@ def median( ) def max( self, - axis: AxisInt | None | lib.NoDefault = lib.no_default, + axis: AxisInt | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11479,7 +11462,7 @@ def max( ) def min( self, - axis: AxisInt | None | lib.NoDefault = lib.no_default, + axis: AxisInt | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, **kwargs, @@ -11708,6 +11691,12 @@ def _doc_params(cls): axis : {axis_descr} Axis for the function to be applied on. For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + skipna : bool, default True Exclude NA/null values when computing the result. 
numeric_only : bool, default False @@ -11719,7 +11708,7 @@ def _doc_params(cls): Returns ------- -{name1} or {name2} (if level specified)\ +{name1} or scalar\ {see_also}\ {examples} """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 2e0aa5fd0cf40..f1d176e59373f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1488,7 +1488,6 @@ def test_median_categorical_dtype_nuisance_column(self): # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead # of expected.values - @pytest.mark.filterwarnings("ignore:.*will return a scalar.*:FutureWarning") @pytest.mark.parametrize("method", ["min", "max"]) def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): # GH#28949 DataFrame.min should behave like Series.min @@ -1510,7 +1509,7 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): getattr(df, method)() with pytest.raises(TypeError, match="is not ordered for operation"): - getattr(np, method)(df) + getattr(np, method)(df, axis=0) # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(object) @@ -1518,7 +1517,7 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): getattr(df, method)() with pytest.raises(TypeError, match="is not ordered for operation"): - getattr(np, method)(df) + getattr(np, method)(df, axis=0) def test_sum_timedelta64_skipna_false(using_array_manager, request): @@ -1600,20 +1599,22 @@ def test_prod_sum_min_count_mixed_object(): @pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"]) -def test_reduction_axis_none_deprecation(method): - # GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it - # to reducing over all axes. +def test_reduction_axis_none_returns_scalar(method): + # GH#21597 As of 2.0, axis=None reduces over all axes. 
df = DataFrame(np.random.randn(4, 4)) - meth = getattr(df, method) - - msg = f"scalar {method} over the entire DataFrame" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = meth(axis=None) - with tm.assert_produces_warning(None): - expected = meth() - tm.assert_series_equal(res, expected) - tm.assert_series_equal(res, meth(axis=0)) + + result = getattr(df, method)(axis=None) + np_arr = df.to_numpy() + if method in {"skew", "kurt"}: + comp_mod = pytest.importorskip("scipy.stats") + if method == "kurt": + method = "kurtosis" + expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None) + tm.assert_almost_equal(result, expected) + else: + expected = getattr(np, method)(np_arr, axis=None) + assert result == expected @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d9181830925f7..0fabdf84e5e86 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -147,11 +147,7 @@ def f(x): tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) gbc = df.groupby(c, observed=False) - with tm.assert_produces_warning( - FutureWarning, match="scalar max", check_stacklevel=False - ): - # stacklevel is thrown off (i think) bc the stack goes through numpy C code - result = gbc.transform(lambda xs: np.max(xs)) + result = gbc.transform(lambda xs: np.max(xs, axis=0)) tm.assert_frame_equal(result, df[["a"]]) with tm.assert_produces_warning(None): @@ -295,7 +291,7 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) - result = grouped.apply(lambda x: np.mean(x)) + result = grouped.apply(lambda x: np.mean(x, axis=0)) tm.assert_frame_equal(result, expected) result = grouped.mean() diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index bb15783f4607f..875037b390883 100644 
--- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -80,20 +80,12 @@ def test_builtins_apply(keys, f): assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" assert result.shape == (ngroups, 3), assert_msg - npfunc = getattr(np, fname) # numpy's equivalent function - if f in [max, min]: - warn = FutureWarning - else: - warn = None - msg = "scalar (max|min) over the entire DataFrame" - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - # stacklevel can be thrown off because (i think) the stack - # goes through some of numpy's C code. - expected = gb.apply(npfunc) + npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function + expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(None): - expected2 = gb.apply(lambda x: npfunc(x, axis=0)) + expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) if f != sum: @@ -101,7 +93,7 @@ def test_builtins_apply(keys, f): expected.set_index(keys, inplace=True, drop=False) tm.assert_frame_equal(result, expected, check_dtype=False) - tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) + tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0)) class TestNumericOnly: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 8f1d52c2ea03d..cb3b7def39c52 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -57,7 +57,7 @@ def test_transform(): tm.assert_frame_equal(result, expected) def demean(arr): - return arr - arr.mean() + return arr - arr.mean(axis=0) people = DataFrame( np.random.randn(5, 5), @@ -144,7 +144,7 @@ def test_transform_broadcast(tsframe, ts): result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) for _, gp in grouped: - agged = gp.mean() + 
agged = gp.mean(axis=0) res = result.reindex(gp.index) for col in tsframe: assert_fp_equal(res[col], agged[col]) @@ -214,7 +214,7 @@ def test_transform_axis_ts(tsframe): ts = tso grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) result = ts - grouped.transform("mean") - expected = grouped.apply(lambda x: x - x.mean()) + expected = grouped.apply(lambda x: x - x.mean(axis=0)) tm.assert_frame_equal(result, expected) ts = ts.T @@ -227,7 +227,7 @@ def test_transform_axis_ts(tsframe): ts = tso.iloc[[1, 0] + list(range(2, len(base)))] grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) result = ts - grouped.transform("mean") - expected = grouped.apply(lambda x: x - x.mean()) + expected = grouped.apply(lambda x: x - x.mean(axis=0)) tm.assert_frame_equal(result, expected) ts = ts.T @@ -477,16 +477,9 @@ def test_transform_coercion(): expected = g.transform(np.mean) - # in 2.0 np.mean on a DataFrame is equivalent to frame.mean(axis=None) - # which not gives a scalar instead of Series - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g.transform(lambda x: np.mean(x)) + result = g.transform(lambda x: np.mean(x, axis=0)) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): - result2 = g.transform(lambda x: np.mean(x, axis=0)) - tm.assert_frame_equal(result2, expected) - def test_groupby_transform_with_int():