From 997803893c8b3b64ee4234b9e74b3ca51fe25ec2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 30 Oct 2022 13:07:29 -0400 Subject: [PATCH 1/4] DEPR: Enforce deprecation of silent dropping of nuisance columns in agg_list_like --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/apply.py | 82 ++----------------- pandas/core/groupby/generic.py | 2 +- pandas/tests/apply/test_frame_apply.py | 71 ++++++++-------- .../tests/groupby/aggregate/test_aggregate.py | 17 ++-- pandas/tests/groupby/aggregate/test_other.py | 24 ++---- pandas/tests/groupby/test_groupby.py | 62 +++++++++++--- pandas/tests/resample/test_resample_api.py | 14 ++-- 8 files changed, 119 insertions(+), 155 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f86c5099a685f..976f91a7bde1b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -275,7 +275,7 @@ Removal of prior version deprecations/changes - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) - Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`) - Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`) -- +- Change behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`) .. --------------------------------------------------------------------------- .. _whatsnew_200.performance: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4f9af2d0c01d6..c04f9fe2fae44 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -4,7 +4,6 @@ from collections import defaultdict from functools import partial import inspect -import re from typing import ( TYPE_CHECKING, Any, @@ -35,10 +34,7 @@ NDFrameT, npt, ) -from pandas.errors import ( - DataError, - SpecificationError, -) +from pandas.errors import SpecificationError from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -345,88 +341,28 @@ def agg_list_like(self) -> DataFrame | Series: results = [] keys = [] - failed_names = [] - - depr_nuisance_columns_msg = ( - "{} did not aggregate successfully. If any error is " - "raised this will raise in a future version of pandas. " - "Drop these columns/ops to avoid this warning." - ) # degenerate case if selected_obj.ndim == 1: for a in arg: colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) - try: - new_res = colg.aggregate(a) + new_res = colg.aggregate(a) + results.append(new_res) - except TypeError: - failed_names.append(com.get_callable_name(a) or a) - else: - results.append(new_res) - - # make sure we find a good name - name = com.get_callable_name(a) or a - keys.append(name) + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) # multiples else: indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) - try: - # Capture and suppress any warnings emitted by us in the call - # to agg below, but pass through any warnings that were - # generated otherwise. - # This is necessary because of https://bugs.python.org/issue29672 - # See GH #43741 for more details - with warnings.catch_warnings(record=True) as record: - new_res = colg.aggregate(arg) - if len(record) > 0: - match = re.compile(depr_nuisance_columns_msg.format(".*")) - for warning in record: - if re.match(match, str(warning.message)): - failed_names.append(col) - else: - warnings.warn_explicit( - message=warning.message, - category=warning.category, - filename=warning.filename, - lineno=warning.lineno, - ) - - except (TypeError, DataError): - failed_names.append(col) - except ValueError as err: - # cannot aggregate - if "Must produce aggregated value" in str(err): - # raised directly in _aggregate_named - failed_names.append(col) - elif "no results" in str(err): - # reached in test_frame_apply.test_nuiscance_columns - # where the colg.aggregate(arg) ends up going through - # the selected_obj.ndim == 1 branch above with arg == ["sum"] - # on a datetime64[ns] column - failed_names.append(col) - else: - raise - else: - results.append(new_res) - indices.append(index) - + new_res = colg.aggregate(arg) + results.append(new_res) + indices.append(index) keys = selected_obj.columns.take(indices) - # if we are empty - if not len(results): - raise ValueError("no results") - - if len(failed_names) > 0: - warnings.warn( - depr_nuisance_columns_msg.format(failed_names), - FutureWarning, - stacklevel=find_stack_level(), - ) - try: concatenated = concat(results, keys=keys, axis=1, sort=False) except TypeError as err: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4c06ee60d3f6a..616e0dc419da6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1141,7 +1141,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result = gba.agg() except ValueError as err: - if "no results" not in str(err): + if "No objects to concatenate" not in str(err): # raised directly by _aggregate_multiple_funcs raise result = self._aggregate_frame(func) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index c6294cfc0c670..510d4ab702fdd 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1141,14 +1141,13 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) -def test_agg_multiple_mixed_no_warning(): +def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( { "A": [1, 2, 3], "B": [1.0, 2.0, 3.0], "C": ["foo", "bar", "baz"], - "D": date_range("20130101", periods=3), } ) expected = DataFrame( @@ -1156,29 +1155,41 @@ def test_agg_multiple_mixed_no_warning(): "A": [1, 6], "B": [1.0, 6.0], "C": ["bar", "foobarbaz"], - "D": [Timestamp("2013-01-01"), pd.NaT], }, index=["min", "sum"], ) # sorted index - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): - result = mdf.agg(["min", "sum"]) - + result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): - result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) - + result = mdf[["C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. - expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) + expected = expected[["C", "B", "A"]].reindex(["sum", "min"]) tm.assert_frame_equal(result, expected) +def test_agg_multiple_mixed_raises(): + # GH 20909 + mdf = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": date_range("20130101", periods=3), + } + ) + + # sorted index + # TODO: GH#49399 will fix error message + msg = "DataFrame constructor called with" + with pytest.raises(TypeError, match=msg): + mdf.agg(["min", "sum"]) + + with pytest.raises(TypeError, match=msg): + mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) + + def test_agg_reduce(axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() @@ -1277,14 +1288,10 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): - result = df.agg(["sum"]) - expected = DataFrame( - [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] - ) - tm.assert_frame_equal(result, expected) + # TODO: GH#49399 will fix error message + msg = "DataFrame constructor called with" + with pytest.raises(TypeError, match=msg): + df.agg(["sum"]) @pytest.mark.parametrize("how", ["agg", "apply"]) @@ -1499,27 +1506,23 @@ def test_aggregation_func_column_order(): # according to the arguments provided to agg. df = DataFrame( [ - ("1", 1, 0, 0), - ("2", 2, 0, 0), - ("3", 3, 0, 0), - ("4", 4, 5, 4), - ("5", 5, 6, 6), - ("6", 6, 7, 7), + (1, 0, 0), + (2, 0, 0), + (3, 0, 0), + (4, 5, 4), + (5, 6, 6), + (6, 7, 7), ], - columns=("item", "att1", "att2", "att3"), + columns=("att1", "att2", "att3"), ) def foo(s): return s.sum() / 2 aggs = ["sum", foo, "count", "min"] - with tm.assert_produces_warning( - FutureWarning, match=r"\['item'\] did not aggregate successfully" - ): - result = df.agg(aggs) + result = df.agg(aggs) expected = DataFrame( { - "item": ["123456", np.nan, 6, "1"], "att1": [21.0, 10.5, 6.0, 1.0], "att2": [18.0, 9.0, 6.0, 0.0], "att3": [17.0, 8.5, 6.0, 0.0], diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ad7368a69c0f5..9514d4c95b394 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -383,21 +383,18 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): def test_multiple_functions_tuples_and_non_tuples(df): # #1359 + # Columns B and C would cause partial failure + df = df.drop(columns=["B", "C"]) + funcs = [("foo", "mean"), "std"] ex_funcs = [("foo", "mean"), ("std", "std")] - result = df.groupby("A")["C"].agg(funcs) - expected = df.groupby("A")["C"].agg(ex_funcs) + result = df.groupby("A")["D"].agg(funcs) + expected = df.groupby("A")["D"].agg(ex_funcs) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['B'\] did not aggregate successfully" - ): - result = df.groupby("A").agg(funcs) - with tm.assert_produces_warning( - FutureWarning, match=r"\['B'\] did not aggregate successfully" - ): - expected = df.groupby("A").agg(ex_funcs) + result = df.groupby("A").agg(funcs) + expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index f84abecea37da..6740729d038a7 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -25,10 +25,8 @@ from pandas.io.formats.printing import pprint_thing -def test_agg_api(): - # GH 6337 - # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error - # different api for agg when passed custom function with mixed frame +def test_agg_partial_failure_raises(): + # GH#43741 df = DataFrame( { @@ -43,19 +41,11 @@ def test_agg_api(): def peak_to_peak(arr): return arr.max() - arr.min() - with tm.assert_produces_warning( - FutureWarning, - match=r"\['key2'\] did not aggregate successfully", - ): - expected = grouped.agg([peak_to_peak]) - expected.columns = ["data1", "data2"] - - with tm.assert_produces_warning( - FutureWarning, - match=r"\['key2'\] did not aggregate successfully", - ): - result = grouped.agg(peak_to_peak) - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="unsupported operand type"): + grouped.agg([peak_to_peak]) + + with pytest.raises(TypeError, match="unsupported operand type"): + grouped.agg(peak_to_peak) def test_agg_datetimes_mixed(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 26f269d3d4384..f4d8fc55d8d46 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -502,6 +502,54 @@ def test_multi_key_multiple_functions(df): def test_frame_multi_key_function_list(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + grouped = data.groupby(["A", "B"]) + funcs = [np.mean, np.std] + agged = grouped.agg(funcs) + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) + assert isinstance(agged.index, MultiIndex) + assert isinstance(expected.index, MultiIndex) + tm.assert_frame_equal(agged, expected) + + +def test_frame_multi_key_function_list_partial_failure(): data = DataFrame( { "A": [ @@ -551,18 +599,8 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - with tm.assert_produces_warning( - FutureWarning, match=r"\['C'\] did not aggregate successfully" - ): - agged = grouped.agg(funcs) - expected = pd.concat( - [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], - keys=["D", "E", "F"], - axis=1, - ) - assert isinstance(agged.index, MultiIndex) - assert isinstance(expected.index, MultiIndex) - tm.assert_frame_equal(agged, expected) + with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"): + grouped.agg(funcs) @pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()]) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index c5cd777962df3..53d416a74cac2 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -407,14 +407,14 @@ def test_agg(): expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: # In case 2, "date" is an index and a column, so agg still tries to agg - warn = FutureWarning if t == cases[2] else None - with tm.assert_produces_warning( - warn, - match=r"\['date'\] did not aggregate successfully", - ): - # .var on dt64 column raises and is dropped + if t == cases[2]: + # .var on dt64 column raises + msg = "Cannot cast DatetimeArray to dtype float64" + with pytest.raises(TypeError, match=msg): + t.aggregate([np.mean, np.std]) + else: result = t.aggregate([np.mean, np.std]) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) for t in cases: From 1d62c3cf67974c5931e670cce5ff0bea3d9f88c9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 30 Oct 2022 13:10:50 -0400 Subject: [PATCH 2/4] Remove type-ignore --- pandas/core/apply.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c04f9fe2fae44..41caa5e4110ff 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -443,10 +443,8 @@ def agg_dict_like(self) -> DataFrame | Series: keys_to_use = ktu axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 - # error: Key expression in dictionary comprehension has incompatible type - # "Hashable"; expected type "NDFrame" [misc] result = concat( - {k: results[k] for k in keys_to_use}, # type: ignore[misc] + {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use, ) From 75fbf422295eb61ef5b75bfd19d32ae7ae2544dd Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 31 Oct 2022 17:37:53 -0400 Subject: [PATCH 3/4] Fixups --- asv_bench/benchmarks/groupby.py | 2 +- doc/source/user_guide/basics.rst | 28 ---------------------------- doc/source/user_guide/groupby.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 9 ++++++--- pandas/core/apply.py | 2 +- 5 files changed, 9 insertions(+), 34 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 2225cbd74d718..0f0382eaf1584 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -310,7 +310,7 @@ def time_different_python_functions_multicol(self, df): df.groupby(["key1", "key2"]).agg([sum, min, max]) def time_different_python_functions_singlecol(self, df): - df.groupby("key1").agg([sum, min, max]) + df.groupby("key1")[["value1", "value2", "value3"]].agg([sum, min, max]) class GroupStrings: diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 0883113474f54..2204c8b04e438 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1039,34 +1039,6 @@ not noted for a particular column will be ``NaN``: tsdf.agg({"A": ["mean", "min"], "B": "sum"}) -.. _basics.aggregation.mixed_string: - -Mixed dtypes -++++++++++++ - -.. deprecated:: 1.4.0 - Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any porition of the columns or operations provided fail, the call to ``.agg`` will raise. - -When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid -aggregations. This is similar to how ``.groupby.agg`` works. - -.. ipython:: python - - mdf = pd.DataFrame( - { - "A": [1, 2, 3], - "B": [1.0, 2.0, 3.0], - "C": ["foo", "bar", "baz"], - "D": pd.date_range("20130101", periods=3), - } - ) - mdf.dtypes - -.. ipython:: python - :okwarning: - - mdf.agg(["min", "sum"]) - .. _basics.aggregation.custom_describe: Custom describe diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index f9b8b793bfde8..dae42dd4f1118 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1007,7 +1007,7 @@ functions: .. ipython:: python :okwarning: - grouped = df.groupby("A") + grouped = df.groupby("A")[["C", "D"]] grouped.agg(lambda x: x.std()) But, it's rather verbose and can be untidy if you need to pass additional diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index faf4b1ac44d5b..b41a469fe0c1f 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -104,10 +104,13 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`) 'D': pd.date_range('20130101', periods=3)}) df.dtypes -.. ipython:: python - :okwarning: +.. code-block:: python - df.agg(['min', 'sum']) + In [10]: df.agg(['min', 'sum']) + Out[10]: + A B C D + min 1 1.0 bar 2013-01-01 + sum 6 6.0 foobarbaz NaT .. _whatsnew_0200.enhancements.dataio_dtype: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 597f6e4e01072..67edde2feafbe 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -417,7 +417,7 @@ def agg_dict_like(self) -> DataFrame | Series: axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + {k: results[k] for k in keys_to_use}, # type: ignore[misc] axis=axis, keys=keys_to_use, ) From 42e10c62408bdec442a6d8ea79dd4ae52552fe22 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 1 Nov 2022 16:21:15 -0400 Subject: [PATCH 4/4] Remove outdated comment --- pandas/core/groupby/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 616e0dc419da6..28b5e944174a8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1142,7 +1142,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) except ValueError as err: if "No objects to concatenate" not in str(err): - # raised directly by _aggregate_multiple_funcs raise result = self._aggregate_frame(func)