Skip to content

Commit

Permalink
DEPR: Enforce deprecation of silent dropping of nuisance columns in a…
Browse files Browse the repository at this point in the history
…gg_list_like (#49401)

* DEPR: Enforce deprecation of silent dropping of nuisance columns in agg_list_like

* Remove type-ignore

* Fixups

* Remove outdated comment
  • Loading branch information
rhshadrach authored Nov 2, 2022
1 parent cb43a81 commit fb9b345
Show file tree
Hide file tree
Showing 12 changed files with 127 additions and 193 deletions.
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def time_different_python_functions_multicol(self, df):
df.groupby(["key1", "key2"]).agg([sum, min, max])

def time_different_python_functions_singlecol(self, df):
df.groupby("key1").agg([sum, min, max])
df.groupby("key1")[["value1", "value2", "value3"]].agg([sum, min, max])


class GroupStrings:
Expand Down
28 changes: 0 additions & 28 deletions doc/source/user_guide/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1039,34 +1039,6 @@ not noted for a particular column will be ``NaN``:
tsdf.agg({"A": ["mean", "min"], "B": "sum"})
.. _basics.aggregation.mixed_string:

Mixed dtypes
++++++++++++

.. deprecated:: 1.4.0
Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any portion of the columns or operations provided fails, the call to ``.agg`` will raise.

When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid
aggregations. This is similar to how ``.groupby.agg`` works.

.. ipython:: python
mdf = pd.DataFrame(
{
"A": [1, 2, 3],
"B": [1.0, 2.0, 3.0],
"C": ["foo", "bar", "baz"],
"D": pd.date_range("20130101", periods=3),
}
)
mdf.dtypes
.. ipython:: python
:okwarning:
mdf.agg(["min", "sum"])
.. _basics.aggregation.custom_describe:

Custom describe
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@ functions:
.. ipython:: python
:okwarning:
grouped = df.groupby("A")
grouped = df.groupby("A")[["C", "D"]]
grouped.agg(lambda x: x.std())
But, it's rather verbose and can be untidy if you need to pass additional
Expand Down
9 changes: 6 additions & 3 deletions doc/source/whatsnew/v0.20.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,13 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`)
'D': pd.date_range('20130101', periods=3)})
df.dtypes
.. ipython:: python
:okwarning:
.. code-block:: python
df.agg(['min', 'sum'])
In [10]: df.agg(['min', 'sum'])
Out[10]:
A B C D
min 1 1.0 bar 2013-01-01
sum 6 6.0 foobarbaz NaT
.. _whatsnew_0200.enhancements.dataio_dtype:

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ Removal of prior version deprecations/changes
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
-
- Changed behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`)

.. ---------------------------------------------------------------------------
.. _whatsnew_200.performance:
Expand Down
86 changes: 9 additions & 77 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from collections import defaultdict
from functools import partial
import inspect
import re
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -18,7 +17,6 @@
Sequence,
cast,
)
import warnings

import numpy as np

Expand All @@ -35,12 +33,8 @@
NDFrameT,
npt,
)
from pandas.errors import (
DataError,
SpecificationError,
)
from pandas.errors import SpecificationError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import is_nested_object
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -317,88 +311,28 @@ def agg_list_like(self) -> DataFrame | Series:

results = []
keys = []
failed_names = []

depr_nuisance_columns_msg = (
"{} did not aggregate successfully. If any error is "
"raised this will raise in a future version of pandas. "
"Drop these columns/ops to avoid this warning."
)

# degenerate case
if selected_obj.ndim == 1:
for a in arg:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
try:
new_res = colg.aggregate(a)

except TypeError:
failed_names.append(com.get_callable_name(a) or a)
else:
results.append(new_res)
new_res = colg.aggregate(a)
results.append(new_res)

# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)
# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)

# multiples
else:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
try:
# Capture and suppress any warnings emitted by us in the call
# to agg below, but pass through any warnings that were
# generated otherwise.
# This is necessary because of https://bugs.python.org/issue29672
# See GH #43741 for more details
with warnings.catch_warnings(record=True) as record:
new_res = colg.aggregate(arg)
if len(record) > 0:
match = re.compile(depr_nuisance_columns_msg.format(".*"))
for warning in record:
if re.match(match, str(warning.message)):
failed_names.append(col)
else:
warnings.warn_explicit(
message=warning.message,
category=warning.category,
filename=warning.filename,
lineno=warning.lineno,
)

except (TypeError, DataError):
failed_names.append(col)
except ValueError as err:
# cannot aggregate
if "Must produce aggregated value" in str(err):
# raised directly in _aggregate_named
failed_names.append(col)
elif "no results" in str(err):
# reached in test_frame_apply.test_nuiscance_columns
# where the colg.aggregate(arg) ends up going through
# the selected_obj.ndim == 1 branch above with arg == ["sum"]
# on a datetime64[ns] column
failed_names.append(col)
else:
raise
else:
results.append(new_res)
indices.append(index)

new_res = colg.aggregate(arg)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)

# if we are empty
if not len(results):
raise ValueError("no results")

if len(failed_names) > 0:
warnings.warn(
depr_nuisance_columns_msg.format(failed_names),
FutureWarning,
stacklevel=find_stack_level(),
)

try:
concatenated = concat(results, keys=keys, axis=1, sort=False)
except TypeError as err:
Expand Down Expand Up @@ -479,8 +413,6 @@ def agg_dict_like(self) -> DataFrame | Series:
keys_to_use = ktu

axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
# error: Key expression in dictionary comprehension has incompatible type
# "Hashable"; expected type "NDFrame" [misc]
result = concat(
{k: results[k] for k in keys_to_use}, # type: ignore[misc]
axis=axis,
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,8 +1138,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
result = gba.agg()

except ValueError as err:
if "no results" not in str(err):
# raised directly by _aggregate_multiple_funcs
if "No objects to concatenate" not in str(err):
raise
result = self._aggregate_frame(func)

Expand Down
71 changes: 37 additions & 34 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,44 +1141,55 @@ def test_agg_with_name_as_column_name():
tm.assert_series_equal(result, expected)


def test_agg_multiple_mixed_no_warning():
def test_agg_multiple_mixed():
# GH 20909
mdf = DataFrame(
{
"A": [1, 2, 3],
"B": [1.0, 2.0, 3.0],
"C": ["foo", "bar", "baz"],
"D": date_range("20130101", periods=3),
}
)
expected = DataFrame(
{
"A": [1, 6],
"B": [1.0, 6.0],
"C": ["bar", "foobarbaz"],
"D": [Timestamp("2013-01-01"), pd.NaT],
},
index=["min", "sum"],
)
# sorted index
with tm.assert_produces_warning(
FutureWarning, match=r"\['D'\] did not aggregate successfully"
):
result = mdf.agg(["min", "sum"])

result = mdf.agg(["min", "sum"])
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, match=r"\['D'\] did not aggregate successfully"
):
result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"])

result = mdf[["C", "B", "A"]].agg(["sum", "min"])
# GH40420: the result of .agg should have an index that is sorted
# according to the arguments provided to agg.
expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"])
expected = expected[["C", "B", "A"]].reindex(["sum", "min"])
tm.assert_frame_equal(result, expected)


def test_agg_multiple_mixed_raises():
# GH 20909
mdf = DataFrame(
{
"A": [1, 2, 3],
"B": [1.0, 2.0, 3.0],
"C": ["foo", "bar", "baz"],
"D": date_range("20130101", periods=3),
}
)

# sorted index
# TODO: GH#49399 will fix error message
msg = "DataFrame constructor called with"
with pytest.raises(TypeError, match=msg):
mdf.agg(["min", "sum"])

with pytest.raises(TypeError, match=msg):
mdf[["D", "C", "B", "A"]].agg(["sum", "min"])


def test_agg_reduce(axis, float_frame):
other_axis = 1 if axis in {0, "index"} else 0
name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()
Expand Down Expand Up @@ -1277,14 +1288,10 @@ def test_nuiscance_columns():
expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, match=r"\['D'\] did not aggregate successfully"
):
result = df.agg(["sum"])
expected = DataFrame(
[[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"]
)
tm.assert_frame_equal(result, expected)
# TODO: GH#49399 will fix error message
msg = "DataFrame constructor called with"
with pytest.raises(TypeError, match=msg):
df.agg(["sum"])


@pytest.mark.parametrize("how", ["agg", "apply"])
Expand Down Expand Up @@ -1499,27 +1506,23 @@ def test_aggregation_func_column_order():
# according to the arguments provided to agg.
df = DataFrame(
[
("1", 1, 0, 0),
("2", 2, 0, 0),
("3", 3, 0, 0),
("4", 4, 5, 4),
("5", 5, 6, 6),
("6", 6, 7, 7),
(1, 0, 0),
(2, 0, 0),
(3, 0, 0),
(4, 5, 4),
(5, 6, 6),
(6, 7, 7),
],
columns=("item", "att1", "att2", "att3"),
columns=("att1", "att2", "att3"),
)

def foo(s):
return s.sum() / 2

aggs = ["sum", foo, "count", "min"]
with tm.assert_produces_warning(
FutureWarning, match=r"\['item'\] did not aggregate successfully"
):
result = df.agg(aggs)
result = df.agg(aggs)
expected = DataFrame(
{
"item": ["123456", np.nan, 6, "1"],
"att1": [21.0, 10.5, 6.0, 1.0],
"att2": [18.0, 9.0, 6.0, 0.0],
"att3": [17.0, 8.5, 6.0, 0.0],
Expand Down
17 changes: 7 additions & 10 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,21 +383,18 @@ def test_agg_multiple_functions_same_name_with_ohlc_present():

def test_multiple_functions_tuples_and_non_tuples(df):
# #1359
# Columns B and C would cause partial failure
df = df.drop(columns=["B", "C"])

funcs = [("foo", "mean"), "std"]
ex_funcs = [("foo", "mean"), ("std", "std")]

result = df.groupby("A")["C"].agg(funcs)
expected = df.groupby("A")["C"].agg(ex_funcs)
result = df.groupby("A")["D"].agg(funcs)
expected = df.groupby("A")["D"].agg(ex_funcs)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, match=r"\['B'\] did not aggregate successfully"
):
result = df.groupby("A").agg(funcs)
with tm.assert_produces_warning(
FutureWarning, match=r"\['B'\] did not aggregate successfully"
):
expected = df.groupby("A").agg(ex_funcs)
result = df.groupby("A").agg(funcs)
expected = df.groupby("A").agg(ex_funcs)
tm.assert_frame_equal(result, expected)


Expand Down
Loading

0 comments on commit fb9b345

Please sign in to comment.