diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 5b3e607956f7a..ca8d60051ff90 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -10,6 +10,36 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none +.. _whatsnew_0232.enhancements: + +Logical Reductions over Entire DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2], "B": [True, False]}) + df.all(axis=None) + + +This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``. +With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` will no longer reduce over every axis: + +.. code-block:: python + + >>> # NumPy 1.15, pandas 0.23.1 + >>> np.any(pd.DataFrame({"A": [False], "B": [False]})) + A False + B False + dtype: bool + +With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15. + +.. ipython:: python + + np.any(pd.DataFrame({"A": [False], "B": [False]})) + .. _whatsnew_0232.fixed_regressions: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 74bb2abc27c4b..9884bf9a53478 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6846,13 +6846,18 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): - axis = self._get_axis_number(axis) + if axis is None and filter_type == 'bool': + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - labels = self._get_agg_axis(axis) - # exclude timedelta/datetime unless we are uniform types if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: numeric_only = True @@ -6861,6 +6866,13 @@ def f(x): try: values = self.values result = f(values) + + if (filter_type == 'bool' and is_object_dtype(values) and + axis is None): + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) except Exception as e: # try by-column first @@ -6927,7 +6939,9 @@ def f(x): if axis == 0: result = coerce_to_dtypes(result, self.dtypes) - return Series(result, index=labels) + if constructor is not None: + result = Series(result, index=labels) + return result def nunique(self, axis=0, dropna=True): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1780e359164e2..bdf2fe350b42d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8727,6 +8727,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + if axis is None: + raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) @@ -9053,8 +9055,15 @@ def _doc_parms(cls): Parameters ---------- -axis : int, default 0 - Select the axis which can be 0 for indices and 1 for columns. +axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. + * None : reduce all axes, return a scalar. + skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -9076,9 +9085,9 @@ def _doc_parms(cls): %(examples)s""" _all_doc = """\ -Return whether all elements are True over series or dataframe axis. +Return whether all elements are True, potentially over an axis. -Returns True if all elements within a series or along a dataframe +Returns True if all elements within a series or along a Dataframe axis are non-zero, not-empty or not-False.""" _all_examples = """\ @@ -9091,7 +9100,7 @@ def _doc_parms(cls): >>> pd.Series([True, False]).all() False -Dataframes +DataFrames Create a dataframe from a dictionary. @@ -9108,12 +9117,17 @@ def _doc_parms(cls): col2 False dtype: bool -Adding axis=1 argument will check if row-wise values all return True. +Specify ``axis='columns'`` to check if row-wise values all return True. ->>> df.all(axis=1) +>>> df.all(axis='columns') 0 True 1 False dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False """ _all_see_also = """\ @@ -9479,6 +9493,11 @@ def _doc_parms(cls): 1 False dtype: bool +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + `any` for an empty DataFrame is an empty Series. >>> pd.DataFrame([]).any() @@ -9649,22 +9668,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) - def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number if level is not None: if bool_only is not None: raise NotImplementedError("Option bool_only is not " "implemented with option level.") return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce(f, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool', - name=name) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') return set_function_name(logical_func, name, cls) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c4aa471b8b944..4f7400ad8388b 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1143,13 +1143,26 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, raise NotImplementedError('Panel.{0} does not implement ' 'numeric_only.'.format(name)) - axis_name = self._get_axis_name(axis) - axis_number = self._get_axis_number(axis_name) + if axis is None and filter_type == 'bool': + # labels = None + # constructor = None + axis_number = None + axis_name = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + # labels = self._get_agg_axis(axis) + # constructor = self._constructor + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) with np.errstate(all='ignore'): result = f(self.values) + if axis is None and filter_type == 'bool': + return np.bool_(result) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: result = result.T diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f762dff4aeab..d374ddbf59ad2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3241,7 +3241,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, delegate = self._values if isinstance(delegate, np.ndarray): # Validate that 'axis' is consistent with Series's single axis. - self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6dc24ed856017..5f6aec9d882b6 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -15,7 +15,7 @@ from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, - _np_version_under1p12, _np_version_under1p15, + _np_version_under1p12, to_datetime, to_timedelta) import pandas as pd import pandas.core.nanops as nanops @@ -1159,11 +1159,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) - df = DataFrame(randn(10, 4)) > 0 - df.any(1) - df.all(1) - df.any(1, bool_only=True) - df.all(1, bool_only=True) + def test_any_all_extra(self): + df = DataFrame({ + 'A': [True, False, False], + 'B': [True, True, False], + 'C': [True, True, True], + }, index=['a', 'b', 'c']) + result = df[['A', 'B']].any(1) + expected = Series([True, True, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df[['A', 'B']].any(1, bool_only=True) + tm.assert_series_equal(result, expected) + + result = df.all(1) + expected = Series([True, False, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df.all(1, bool_only=True) + tm.assert_series_equal(result, expected) + + # Axis is None + result = df.all(axis=None).item() + assert result is False + + result = df.any(axis=None).item() + assert result is True + + result = df[['C']].all(axis=None).item() + assert result is True # skip pathological failure cases # class CantNonzero(object): @@ -1185,6 +1209,86 @@ def test_any_all(self): # df.any(1, bool_only=True) # df.all(1, bool_only=True) + @pytest.mark.parametrize('func, data, expected', [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {'A': []}, False), + (np.all, {'A': []}, True), + (np.any, {'A': [False, False]}, False), + (np.all, {'A': [False, False]}, False), + (np.any, {'A': [True, False]}, True), + (np.all, {'A': [True, False]}, False), + (np.any, {'A': [True, True]}, True), + (np.all, {'A': [True, True]}, True), + + (np.any, {'A': [False], 'B': [False]}, False), + (np.all, {'A': [False], 'B': [False]}, False), + + (np.any, {'A': [False, False], 'B': [False, True]}, True), + (np.all, {'A': [False, False], 'B': [False, True]}, False), + + # other types + (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), + (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), + (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), + (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), + (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), + (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), + (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), + + # # Mix + # GH-21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ]) + def test_any_all_np_func(self, func, data, expected): + # https://github.com/pandas-dev/pandas/issues/19976 + data = DataFrame(data) + result = func(data) + assert isinstance(result, np.bool_) + assert result.item() is expected + + # method version + result = getattr(DataFrame(data), func.__name__)(axis=None) + assert isinstance(result, np.bool_) + assert result.item() is expected + + def test_any_all_object(self): + # https://github.com/pandas-dev/pandas/issues/19976 + result = np.all(DataFrame(columns=['a', 'b'])).item() + assert result is True + + result = np.any(DataFrame(columns=['a', 'b'])).item() + assert result is False + + @pytest.mark.parametrize('method', ['any', 'all']) + def test_any_all_level_axis_none_raises(self, method): + df = DataFrame( + {"A": 1}, + index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], + names=['out', 'in']) + ) + xpr = "Must specify 'axis' when aggregating by level." + with tm.assert_raises_regex(ValueError, xpr): + getattr(df, method)(axis=None, level='out') + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: @@ -2074,9 +2178,6 @@ def test_clip_against_list_like(self, inplace, lower, axis, res): result = original tm.assert_frame_equal(result, expected, check_exact=True) - @pytest.mark.xfail( - not _np_version_under1p15, - reason="failing under numpy-dev gh-19976") @pytest.mark.parametrize("axis", [0, 1, None]) def test_clip_against_frame(self, axis): df = DataFrame(np.random.randn(1000, 2)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index d95a2ad2d7f76..2f8bc228cf86e 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2707,3 +2707,10 @@ def test_panel_index(): np.repeat([1, 2, 3], 4)], names=['time', 'panel']) tm.assert_index_equal(index, expected) + + +def test_panel_np_all(): + with catch_warnings(record=True): + wp = Panel({"A": DataFrame({'b': [1, 2]})}) + result = np.all(wp) + assert result == np.bool_(True) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 89d90258f58e0..27c24e3a68079 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -30,6 +30,7 @@ def test_foo(): from pandas.compat import (is_platform_windows, is_platform_32bit, PY3, import_lzma) +from pandas.compat.numpy import _np_version_under1p15 from pandas.core.computation.expressions import (_USE_NUMEXPR, _NUMEXPR_INSTALLED) @@ -160,6 +161,9 @@ def decorated_func(func): skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), reason="Missing matplotlib dependency") + +skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15, + reason="NumPy 1.15 or greater required") skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(),