Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Enforce deprecation of silent dropping of nuisance columns in agg_list_like #49401

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def time_different_python_functions_multicol(self, df):
df.groupby(["key1", "key2"]).agg([sum, min, max])

def time_different_python_functions_singlecol(self, df):
df.groupby("key1").agg([sum, min, max])
df.groupby("key1")[["value1", "value2", "value3"]].agg([sum, min, max])


class GroupStrings:
Expand Down
28 changes: 0 additions & 28 deletions doc/source/user_guide/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1039,34 +1039,6 @@ not noted for a particular column will be ``NaN``:

tsdf.agg({"A": ["mean", "min"], "B": "sum"})

.. _basics.aggregation.mixed_string:

Mixed dtypes
++++++++++++

.. deprecated:: 1.4.0
Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any portion of the columns or operations provided fails, the call to ``.agg`` will raise.

When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
aggregations. This is similar to how ``.groupby.agg`` works.

.. ipython:: python

mdf = pd.DataFrame(
{
"A": [1, 2, 3],
"B": [1.0, 2.0, 3.0],
"C": ["foo", "bar", "baz"],
"D": pd.date_range("20130101", periods=3),
}
)
mdf.dtypes

.. ipython:: python
:okwarning:

mdf.agg(["min", "sum"])

.. _basics.aggregation.custom_describe:

Custom describe
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@ functions:
.. ipython:: python
:okwarning:

grouped = df.groupby("A")
grouped = df.groupby("A")[["C", "D"]]
grouped.agg(lambda x: x.std())

But, it's rather verbose and can be untidy if you need to pass additional
Expand Down
9 changes: 6 additions & 3 deletions doc/source/whatsnew/v0.20.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,13 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`)
'D': pd.date_range('20130101', periods=3)})
df.dtypes

.. ipython:: python
:okwarning:
.. code-block:: python

df.agg(['min', 'sum'])
In [10]: df.agg(['min', 'sum'])
Out[10]:
A B C D
min 1 1.0 bar 2013-01-01
sum 6 6.0 foobarbaz NaT

.. _whatsnew_0200.enhancements.dataio_dtype:

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ Removal of prior version deprecations/changes
- Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`)
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
-
- Changed behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`)

.. ---------------------------------------------------------------------------
.. _whatsnew_200.performance:
Expand Down
86 changes: 9 additions & 77 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from collections import defaultdict
from functools import partial
import inspect
import re
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -18,7 +17,6 @@
Sequence,
cast,
)
import warnings

import numpy as np

Expand All @@ -35,12 +33,8 @@
NDFrameT,
npt,
)
from pandas.errors import (
DataError,
SpecificationError,
)
from pandas.errors import SpecificationError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import is_nested_object
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -320,88 +314,28 @@ def agg_list_like(self) -> DataFrame | Series:

results = []
keys = []
failed_names = []

depr_nuisance_columns_msg = (
"{} did not aggregate successfully. If any error is "
"raised this will raise in a future version of pandas. "
"Drop these columns/ops to avoid this warning."
)

# degenerate case
if selected_obj.ndim == 1:
for a in arg:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
try:
new_res = colg.aggregate(a)

except TypeError:
failed_names.append(com.get_callable_name(a) or a)
else:
results.append(new_res)
new_res = colg.aggregate(a)
results.append(new_res)

# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)
# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)

# multiples
else:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
try:
# Capture and suppress any warnings emitted by us in the call
# to agg below, but pass through any warnings that were
# generated otherwise.
# This is necessary because of https://bugs.python.org/issue29672
# See GH #43741 for more details
with warnings.catch_warnings(record=True) as record:
new_res = colg.aggregate(arg)
if len(record) > 0:
match = re.compile(depr_nuisance_columns_msg.format(".*"))
for warning in record:
if re.match(match, str(warning.message)):
failed_names.append(col)
else:
warnings.warn_explicit(
message=warning.message,
category=warning.category,
filename=warning.filename,
lineno=warning.lineno,
)

except (TypeError, DataError):
failed_names.append(col)
except ValueError as err:
# cannot aggregate
if "Must produce aggregated value" in str(err):
# raised directly in _aggregate_named
failed_names.append(col)
elif "no results" in str(err):
# reached in test_frame_apply.test_nuiscance_columns
# where the colg.aggregate(arg) ends up going through
# the selected_obj.ndim == 1 branch above with arg == ["sum"]
# on a datetime64[ns] column
failed_names.append(col)
else:
raise
else:
results.append(new_res)
indices.append(index)

new_res = colg.aggregate(arg)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)

# if we are empty
if not len(results):
raise ValueError("no results")

if len(failed_names) > 0:
warnings.warn(
depr_nuisance_columns_msg.format(failed_names),
FutureWarning,
stacklevel=find_stack_level(),
)

try:
concatenated = concat(results, keys=keys, axis=1, sort=False)
except TypeError as err:
Expand Down Expand Up @@ -482,8 +416,6 @@ def agg_dict_like(self) -> DataFrame | Series:
keys_to_use = ktu

axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
# error: Key expression in dictionary comprehension has incompatible type
# "Hashable"; expected type "NDFrame" [misc]
result = concat(
{k: results[k] for k in keys_to_use}, # type: ignore[misc]
axis=axis,
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,8 +1141,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
result = gba.agg()

except ValueError as err:
if "no results" not in str(err):
# raised directly by _aggregate_multiple_funcs
if "No objects to concatenate" not in str(err):
raise
result = self._aggregate_frame(func)

Expand Down
71 changes: 37 additions & 34 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,44 +1141,55 @@ def test_agg_with_name_as_column_name():
tm.assert_series_equal(result, expected)


def test_agg_multiple_mixed_no_warning():
def test_agg_multiple_mixed():
# GH 20909
mdf = DataFrame(
{
"A": [1, 2, 3],
"B": [1.0, 2.0, 3.0],
"C": ["foo", "bar", "baz"],
"D": date_range("20130101", periods=3),
}
)
expected = DataFrame(
{
"A": [1, 6],
"B": [1.0, 6.0],
"C": ["bar", "foobarbaz"],
"D": [Timestamp("2013-01-01"), pd.NaT],
},
index=["min", "sum"],
)
# sorted index
with tm.assert_produces_warning(
FutureWarning, match=r"\['D'\] did not aggregate successfully"
):
result = mdf.agg(["min", "sum"])

result = mdf.agg(["min", "sum"])
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, match=r"\['D'\] did not aggregate successfully"
):
result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"])

result = mdf[["C", "B", "A"]].agg(["sum", "min"])
# GH40420: the result of .agg should have an index that is sorted
# according to the arguments provided to agg.
expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"])
expected = expected[["C", "B", "A"]].reindex(["sum", "min"])
tm.assert_frame_equal(result, expected)


def test_agg_multiple_mixed_raises():
# GH 20909
mdf = DataFrame(
{
"A": [1, 2, 3],
"B": [1.0, 2.0, 3.0],
"C": ["foo", "bar", "baz"],
"D": date_range("20130101", periods=3),
}
)

# sorted index
# TODO: GH#49399 will fix error message
msg = "DataFrame constructor called with"
with pytest.raises(TypeError, match=msg):
mdf.agg(["min", "sum"])

with pytest.raises(TypeError, match=msg):
mdf[["D", "C", "B", "A"]].agg(["sum", "min"])


def test_agg_reduce(axis, float_frame):
other_axis = 1 if axis in {0, "index"} else 0
name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()
Expand Down Expand Up @@ -1277,14 +1288,10 @@ def test_nuiscance_columns():
expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, match=r"\['D'\] did not aggregate successfully"
):
result = df.agg(["sum"])
expected = DataFrame(
[[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"]
)
tm.assert_frame_equal(result, expected)
# TODO: GH#49399 will fix error message
msg = "DataFrame constructor called with"
with pytest.raises(TypeError, match=msg):
df.agg(["sum"])


@pytest.mark.parametrize("how", ["agg", "apply"])
Expand Down Expand Up @@ -1499,27 +1506,23 @@ def test_aggregation_func_column_order():
# according to the arguments provided to agg.
df = DataFrame(
[
("1", 1, 0, 0),
("2", 2, 0, 0),
("3", 3, 0, 0),
("4", 4, 5, 4),
("5", 5, 6, 6),
("6", 6, 7, 7),
(1, 0, 0),
(2, 0, 0),
(3, 0, 0),
(4, 5, 4),
(5, 6, 6),
(6, 7, 7),
],
columns=("item", "att1", "att2", "att3"),
columns=("att1", "att2", "att3"),
)

def foo(s):
return s.sum() / 2

aggs = ["sum", foo, "count", "min"]
with tm.assert_produces_warning(
FutureWarning, match=r"\['item'\] did not aggregate successfully"
):
result = df.agg(aggs)
result = df.agg(aggs)
expected = DataFrame(
{
"item": ["123456", np.nan, 6, "1"],
"att1": [21.0, 10.5, 6.0, 1.0],
"att2": [18.0, 9.0, 6.0, 0.0],
"att3": [17.0, 8.5, 6.0, 0.0],
Expand Down
17 changes: 7 additions & 10 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,21 +383,18 @@ def test_agg_multiple_functions_same_name_with_ohlc_present():

def test_multiple_functions_tuples_and_non_tuples(df):
# #1359
# Columns B and C would cause partial failure
df = df.drop(columns=["B", "C"])

funcs = [("foo", "mean"), "std"]
ex_funcs = [("foo", "mean"), ("std", "std")]

result = df.groupby("A")["C"].agg(funcs)
expected = df.groupby("A")["C"].agg(ex_funcs)
result = df.groupby("A")["D"].agg(funcs)
expected = df.groupby("A")["D"].agg(ex_funcs)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, match=r"\['B'\] did not aggregate successfully"
):
result = df.groupby("A").agg(funcs)
with tm.assert_produces_warning(
FutureWarning, match=r"\['B'\] did not aggregate successfully"
):
expected = df.groupby("A").agg(ex_funcs)
result = df.groupby("A").agg(funcs)
expected = df.groupby("A").agg(ex_funcs)
tm.assert_frame_equal(result, expected)


Expand Down
Loading