DEPR: Enforce certain DataFrame reductions w/ axis=None to return sca…
mroeschke authored Jan 6, 2023
1 parent 4520f84 commit 47c9ee7
Showing 7 changed files with 54 additions and 80 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -715,6 +715,7 @@ Removal of prior version deprecations/changes
- Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`)
- Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. (:issue:`36695`, :issue:`24804`)
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
- Changed behavior of :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` with ``axis=None`` to return a scalar applying the aggregation across both axes (:issue:`45072`)
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
- Changed behavior of comparison of ``NaT`` with a ``datetime.date`` object; these now raise on inequality comparisons (:issue:`39196`)
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
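To make the enforced change concrete, here is a minimal sketch of the pandas 2.0 behavior the whatsnew entry above describes (an editorial illustration, not part of the diff; the DataFrame contents are arbitrary):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# axis=0 (the default) still reduces column-wise and returns a Series
df.max()            # a    2
                    # b    4

# As of 2.0, axis=None aggregates over both axes and returns a scalar
df.max(axis=None)   # 4
df.mean(axis=None)  # 2.5 (mean of all four values)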
10 changes: 6 additions & 4 deletions pandas/core/frame.py
@@ -10357,9 +10357,8 @@ def _reduce(
assert filter_type is None or filter_type == "bool", filter_type
out_dtype = "bool" if filter_type == "bool" else None

# TODO: Make other agg func handle axis=None properly GH#21597
axis = self._get_axis_number(axis)
assert axis in [0, 1]
if axis is not None:
axis = self._get_axis_number(axis)

def func(values: np.ndarray):
# We only use this in the case that operates on self.values
@@ -10410,7 +10409,7 @@ def _get_data() -> DataFrame:

return out

assert not numeric_only and axis == 1
assert not numeric_only and axis in (1, None)

data = self
values = data.values
@@ -10426,6 +10425,9 @@ def _get_data() -> DataFrame:
# try to coerce to the original dtypes item by item if we can
pass

if axis is None:
return result

labels = self._get_agg_axis(axis)
result = self._constructor_sliced(result, index=labels)
return result
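The two hunks above are the core of the enforcement in DataFrame._reduce: when axis is None, the raw aggregation result is returned directly instead of being wrapped into a Series. A simplified, standalone sketch of that control flow (reduce_frame is a hypothetical helper for illustration, not pandas internals):

import numpy as np
import pandas as pd

def reduce_frame(df: pd.DataFrame, op, axis=0):
    # With axis=None the reduction runs over all values and the scalar
    # is returned as-is; otherwise the result is labeled as pandas does.
    values = df.to_numpy()
    result = op(values, axis=axis)
    if axis is None:
        return result
    labels = df.columns if axis == 0 else df.index
    return pd.Series(result, index=labels)

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
reduce_frame(df, np.max, axis=None)  # 4.0 (scalar)
reduce_frame(df, np.max, axis=0)     # Series with index ['a', 'b']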
51 changes: 20 additions & 31 deletions pandas/core/generic.py
@@ -10944,7 +10944,7 @@ def _stat_function(
self,
name: str,
func,
axis: Axis | None | lib.NoDefault = None,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -10956,30 +10956,13 @@ def _stat_function(

validate_bool_kwarg(skipna, "skipna", none_allowed=False)

if axis is None and self.ndim > 1:
# user must have explicitly passed axis=None
# GH#21597
warnings.warn(
f"In a future version, DataFrame.{name}(axis=None) will return a "
f"scalar {name} over the entire DataFrame. To retain the old "
f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'",
FutureWarning,
stacklevel=find_stack_level(),
)

if axis is lib.no_default:
axis = None

if axis is None:
axis = self._stat_axis_number

return self._reduce(
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
)

def min(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -10995,7 +10978,7 @@ def min(

def max(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11011,7 +10994,7 @@ def max(

def mean(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11022,7 +11005,7 @@ def mean(

def median(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11033,7 +11016,7 @@ def median(

def skew(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11044,7 +11027,7 @@ def skew(

def kurt(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11366,7 +11349,7 @@ def prod(
)
def mean(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11387,7 +11370,7 @@ def mean(
)
def skew(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11411,7 +11394,7 @@ def skew(
)
def kurt(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11433,7 +11416,7 @@ def kurt(
)
def median(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11456,7 +11439,7 @@ def median(
)
def max(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11479,7 +11462,7 @@ def max(
)
def min(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
@@ -11708,6 +11691,12 @@ def _doc_params(cls):
axis : {axis_descr}
Axis for the function to be applied on.
For `Series` this parameter is unused and defaults to 0.
For DataFrames, specifying ``axis=None`` will apply the aggregation
across both axes.

.. versionadded:: 2.0.0

skipna : bool, default True
Exclude NA/null values when computing the result.
numeric_only : bool, default False
@@ -11719,7 +11708,7 @@ def _doc_params(cls):
Returns
-------
{name1} or {name2} (if level specified)\
{name1} or scalar\
{see_also}\
{examples}
"""
31 changes: 16 additions & 15 deletions pandas/tests/frame/test_reductions.py
@@ -1488,7 +1488,6 @@ def test_median_categorical_dtype_nuisance_column(self):
# TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
# of expected.values

@pytest.mark.filterwarnings("ignore:.*will return a scalar.*:FutureWarning")
@pytest.mark.parametrize("method", ["min", "max"])
def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
# GH#28949 DataFrame.min should behave like Series.min
@@ -1510,15 +1509,15 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
getattr(df, method)()

with pytest.raises(TypeError, match="is not ordered for operation"):
getattr(np, method)(df)
getattr(np, method)(df, axis=0)

# same thing, but with an additional non-categorical column
df["B"] = df["A"].astype(object)
with pytest.raises(TypeError, match="is not ordered for operation"):
getattr(df, method)()

with pytest.raises(TypeError, match="is not ordered for operation"):
getattr(np, method)(df)
getattr(np, method)(df, axis=0)


def test_sum_timedelta64_skipna_false(using_array_manager, request):
@@ -1600,20 +1599,22 @@ def test_prod_sum_min_count_mixed_object():


@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
def test_reduction_axis_none_deprecation(method):
# GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it
# to reducing over all axes.
def test_reduction_axis_none_returns_scalar(method):
# GH#21597 As of 2.0, axis=None reduces over all axes.

df = DataFrame(np.random.randn(4, 4))
meth = getattr(df, method)

msg = f"scalar {method} over the entire DataFrame"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = meth(axis=None)
with tm.assert_produces_warning(None):
expected = meth()
tm.assert_series_equal(res, expected)
tm.assert_series_equal(res, meth(axis=0))

result = getattr(df, method)(axis=None)
np_arr = df.to_numpy()
if method in {"skew", "kurt"}:
comp_mod = pytest.importorskip("scipy.stats")
if method == "kurt":
method = "kurtosis"
expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
tm.assert_almost_equal(result, expected)
else:
expected = getattr(np, method)(np_arr, axis=None)
assert result == expected


@pytest.mark.parametrize(
8 changes: 2 additions & 6 deletions pandas/tests/groupby/test_categorical.py
@@ -147,11 +147,7 @@ def f(x):
tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])

gbc = df.groupby(c, observed=False)
with tm.assert_produces_warning(
FutureWarning, match="scalar max", check_stacklevel=False
):
# stacklevel is thrown off (i think) bc the stack goes through numpy C code
result = gbc.transform(lambda xs: np.max(xs))
result = gbc.transform(lambda xs: np.max(xs, axis=0))
tm.assert_frame_equal(result, df[["a"]])

with tm.assert_produces_warning(None):
@@ -295,7 +291,7 @@ def test_apply(ordered):
idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])

result = grouped.apply(lambda x: np.mean(x))
result = grouped.apply(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result, expected)

result = grouped.mean()
16 changes: 4 additions & 12 deletions pandas/tests/groupby/test_function.py
@@ -80,28 +80,20 @@ def test_builtins_apply(keys, f):
assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
assert result.shape == (ngroups, 3), assert_msg

npfunc = getattr(np, fname) # numpy's equivalent function
if f in [max, min]:
warn = FutureWarning
else:
warn = None
msg = "scalar (max|min) over the entire DataFrame"
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
# stacklevel can be thrown off because (i think) the stack
# goes through some of numpy's C code.
expected = gb.apply(npfunc)
npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function
expected = gb.apply(npfunc)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(None):
expected2 = gb.apply(lambda x: npfunc(x, axis=0))
expected2 = gb.apply(lambda x: npfunc(x))
tm.assert_frame_equal(result, expected2)

if f != sum:
expected = gb.agg(fname).reset_index()
expected.set_index(keys, inplace=True, drop=False)
tm.assert_frame_equal(result, expected, check_dtype=False)

tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))


class TestNumericOnly:
17 changes: 5 additions & 12 deletions pandas/tests/groupby/transform/test_transform.py
@@ -57,7 +57,7 @@ def test_transform():
tm.assert_frame_equal(result, expected)

def demean(arr):
return arr - arr.mean()
return arr - arr.mean(axis=0)

people = DataFrame(
np.random.randn(5, 5),
@@ -144,7 +144,7 @@ def test_transform_broadcast(tsframe, ts):
result = grouped.transform(np.mean)
tm.assert_index_equal(result.index, tsframe.index)
for _, gp in grouped:
agged = gp.mean()
agged = gp.mean(axis=0)
res = result.reindex(gp.index)
for col in tsframe:
assert_fp_equal(res[col], agged[col])
@@ -214,7 +214,7 @@ def test_transform_axis_ts(tsframe):
ts = tso
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
result = ts - grouped.transform("mean")
expected = grouped.apply(lambda x: x - x.mean())
expected = grouped.apply(lambda x: x - x.mean(axis=0))
tm.assert_frame_equal(result, expected)

ts = ts.T
@@ -227,7 +227,7 @@ def test_transform_axis_ts(tsframe):
ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
result = ts - grouped.transform("mean")
expected = grouped.apply(lambda x: x - x.mean())
expected = grouped.apply(lambda x: x - x.mean(axis=0))
tm.assert_frame_equal(result, expected)

ts = ts.T
@@ -477,16 +477,9 @@ def test_transform_coercion():

expected = g.transform(np.mean)

# in 2.0 np.mean on a DataFrame is equivalent to frame.mean(axis=None)
# which now gives a scalar instead of a Series
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = g.transform(lambda x: np.mean(x))
result = g.transform(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(None):
result2 = g.transform(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result2, expected)


def test_groupby_transform_with_int():

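All of the groupby test updates above add an explicit axis=0 when a NumPy reduction is called on a DataFrame inside apply/transform. The reason, as the removed comment in test_transform_coercion notes, is that np.mean/np.max/np.min pass their default axis=None through to the DataFrame method, which as of 2.0 collapses to a scalar. A small illustrative sketch (data chosen arbitrarily, not part of the diff):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

np.mean(df)          # 2.5 -- dispatches to df.mean(axis=None), now a scalar
np.mean(df, axis=0)  # Series: a 1.5, b 3.5 -- the old column-wise result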
