pandas-dev · mroeschke · Jul 13, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -40,6 +40,7 @@ objects.
       api.extensions.ExtensionArray._from_sequence_of_strings
       api.extensions.ExtensionArray._hash_pandas_object
       api.extensions.ExtensionArray._reduce
+      api.extensions.ExtensionArray._reduce_and_wrap
       api.extensions.ExtensionArray._values_for_argsort
       api.extensions.ExtensionArray._values_for_factorize
       api.extensions.ExtensionArray.argsort

diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
@@ -126,10 +126,11 @@ These dtypes can be merged, reshaped & casted.
    pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes
    df["A"].astype(float)
 
-Reduction and groupby operations such as 'sum' work as well.
+Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well.
 
 .. ipython:: python
 
+   df.sum(numeric_only=True)
    df.sum()
    df.groupby("B").A.sum()
 

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -14,10 +14,43 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_210.enhancements.enhancement1:
+.. _whatsnew_210.enhancements.reduction_extension_dtypes:
 
-enhancement1
-^^^^^^^^^^^^
+DataFrame reductions preserve extension dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, the results of DataFrame reductions
+(:meth:`DataFrame.sum`, :meth:`DataFrame.mean`, etc.) had numpy dtypes even when the DataFrames
+were of extension dtypes. Pandas can now keep the dtypes when doing reductions over DataFrame
+columns with a common dtype (:issue:`52788`).
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64")
+    In [2]: df.sum()
+    Out[2]:
+    a    5
+    b    9
+    dtype: int64
+    In [3]: df = df.astype("int64[pyarrow]")
+    In [4]: df.sum()
+    Out[4]:
+    a    5
+    b    9
+    dtype: int64
+
+*New Behavior*
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64")
+    df.sum()
+    df = df.astype("int64[pyarrow]")
+    df.sum()
+
+Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype.
 
 .. _whatsnew_210.enhancements.enhancement2:
 

diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -52,7 +52,7 @@ def _reductions(
     axis : int, optional, default None
     """
     if not skipna:
-        if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
+        if mask.any() or check_below_min_count(values.shape, None, min_count):
             return libmissing.NA
         else:
             return func(values, axis=axis, **kwargs)
@@ -119,11 +119,11 @@ def _minmax(
             # min/max with empty array raise in numpy, pandas returns NA
             return libmissing.NA
         else:
-            return func(values)
+            return func(values, axis=axis)
     else:
         subset = values[~mask]
         if subset.size:
-            return func(subset)
+            return func(subset, axis=axis)
         else:
             # min/max with empty array raise in numpy, pandas returns NA
             return libmissing.NA

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1549,6 +1549,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
 
         return result.as_py()
 
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        """Takes the result of ``_reduce`` and wraps it in a ndarray/ExtensionArray."""
+        result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
+        result = pa.array([result.as_py()], type=result.type)
+        return type(self)(result)
+
     def __setitem__(self, key, value) -> None:
         """Set one or more values inplace.
 

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -136,6 +136,7 @@ class ExtensionArray:
     _from_sequence_of_strings
     _hash_pandas_object
     _reduce
+    _reduce_and_wrap
     _values_for_argsort
     _values_for_factorize
 
@@ -184,6 +185,7 @@ class ExtensionArray:
 
     * _accumulate
     * _reduce
+    * _reduce_and_wrap
 
     One can implement methods to handle parsing from strings that will be used
     in methods such as ``pandas.io.parsers.read_csv``.
@@ -1425,6 +1427,11 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         Raises
         ------
         TypeError : subclass does not define reductions
+
+        See Also
+        --------
+        ExtensionArray._reduce_and_wrap
+            Calls ``_reduce`` and wraps the result in a ndarray/ExtensionArray.
         """
         meth = getattr(self, name, None)
         if meth is None:
@@ -1434,6 +1441,28 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
             )
         return meth(skipna=skipna, **kwargs)
 
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        """
+        Call ``_reduce`` and wrap the result in a ndarray/ExtensionArray.
+
+        This is used to control the returned dtype when doing reductions in DataFrames,
+        and ensures the correct dtype for e.g. ``DataFrame({"a": extr_arr2}).sum()``.
+
+        Returns
+        -------
+        ndarray or ExtensionArray
+
+        Examples
+        --------
+        >>> arr = pd.array([1, 2, pd.NA])
+        >>> arr._reduce_and_wrap("sum", kwargs={})
+        <IntegerArray>
+        [3]
+        Length: 1, dtype: Int64
+        """
+        result = self._reduce(name, skipna=skipna, **kwargs)
+        return np.array([result])
+
     # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
     # Incompatible types in assignment (expression has type "None", base class
     # "object" defined the type as "Callable[[object], int]")

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2124,6 +2124,10 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     # ------------------------------------------------------------------
     # Reductions
 
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        result = self._reduce(name, skipna=skipna, **kwargs)
+        return type(self)([result], dtype=self.dtype)
+
     def min(self, *, skipna: bool = True, **kwargs):
         """
         The minimum value of the object.

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -34,6 +34,10 @@
     Shape,
     npt,
 )
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
@@ -1088,13 +1092,22 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
 
         # median, skew, kurt, sem
         op = getattr(nanops, f"nan{name}")
-        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
-
+        axis = kwargs.pop("axis", None)
+        result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
         if np.isnan(result):
             return libmissing.NA
 
         return result
 
+    def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        res = self._reduce(name=name, skipna=skipna, **kwargs)
+        if res is libmissing.NA:
+            return self._wrap_na_result(name=name, axis=0, mask_size=(1,))
+        else:
+            res = res.reshape(1)
+            mask = np.zeros(1, dtype=bool)
+            return self._maybe_mask_result(res, mask)
+
     def _wrap_reduction_result(self, name: str, result, *, skipna, axis):
         if isinstance(result, np.ndarray):
             if skipna:
@@ -1106,6 +1119,32 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis):
             return self._maybe_mask_result(result, mask)
         return result
 
+    def _wrap_na_result(self, *, name, axis, mask_size):
+        mask = np.ones(mask_size, dtype=bool)
+
+        float_dtyp = "float32" if self.dtype == "Float32" else "float64"
+        if name in ["mean", "median", "var", "std", "skew"]:
+            np_dtype = float_dtyp
+        elif name in ["min", "max"] or self.dtype.itemsize == 8:
+            np_dtype = self.dtype.numpy_dtype.name
+        else:
+            is_windows_or_32bit = is_platform_windows() or not IS64
+            int_dtyp = "int32" if is_windows_or_32bit else "int64"
+            uint_dtyp = "uint32" if is_windows_or_32bit else "uint64"
+            np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[
+                self.dtype.kind
+            ]
+
+        value = np.array([1], dtype=np_dtype)
+        return self._maybe_mask_result(value, mask=mask)
+
+    def _wrap_min_count_reduction_result(
+        self, name: str, result, *, skipna, min_count, axis
+    ):
+        if min_count == 0 and isinstance(result, np.ndarray):
+            return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool))
+        return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis)
+
     def sum(
         self,
         *,
@@ -1123,7 +1162,9 @@ def sum(
             min_count=min_count,
             axis=axis,
         )
-        return self._wrap_reduction_result("sum", result, skipna=skipna, axis=axis)
+        return self._wrap_min_count_reduction_result(
+            "sum", result, skipna=skipna, min_count=min_count, axis=axis
+        )
 
     def prod(
         self,
@@ -1134,14 +1175,17 @@ def prod(
         **kwargs,
     ):
         nv.validate_prod((), kwargs)
+
         result = masked_reductions.prod(
             self._data,
             self._mask,
             skipna=skipna,
             min_count=min_count,
             axis=axis,
         )
-        return self._wrap_reduction_result("prod", result, skipna=skipna, axis=axis)
+        return self._wrap_min_count_reduction_result(
+            "prod", result, skipna=skipna, min_count=min_count, axis=axis
+        )
 
     def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_mean((), kwargs)
@@ -1181,23 +1225,25 @@ def std(
 
     def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_min((), kwargs)
-        return masked_reductions.min(
+        result = masked_reductions.min(
             self._data,
             self._mask,
             skipna=skipna,
             axis=axis,
         )
+        return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis)
 
     def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_max((), kwargs)
-        return masked_reductions.max(
+        result = masked_reductions.max(
             self._data,
             self._mask,
             skipna=skipna,
             axis=axis,
         )
+        return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis)
 
-    def any(self, *, skipna: bool = True, **kwargs):
+    def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         """
         Return whether any element is truthy.
 
@@ -1216,6 +1262,7 @@ def any(self, *, skipna: bool = True, **kwargs):
             If `skipna` is False, the result will still be True if there is
             at least one element that is truthy, otherwise NA will be returned
             if there are NA's present.
+        axis : int, optional, default 0
         **kwargs : any, default None
             Additional keywords have no effect but might be accepted for
             compatibility with NumPy.
@@ -1259,7 +1306,6 @@ def any(self, *, skipna: bool = True, **kwargs):
         >>> pd.array([0, 0, pd.NA]).any(skipna=False)
         <NA>
         """
-        kwargs.pop("axis", None)
         nv.validate_any((), kwargs)
 
         values = self._data.copy()
@@ -1278,7 +1324,7 @@ def any(self, *, skipna: bool = True, **kwargs):
             else:
                 return self.dtype.na_value
 
-    def all(self, *, skipna: bool = True, **kwargs):
+    def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         """
         Return whether all elements are truthy.
 
@@ -1297,6 +1343,7 @@ def all(self, *, skipna: bool = True, **kwargs):
             If `skipna` is False, the result will still be False if there is
             at least one element that is falsey, otherwise NA will be returned
             if there are NA's present.
+        axis : int, optional, default 0
         **kwargs : any, default None
             Additional keywords have no effect but might be accepted for
             compatibility with NumPy.
@@ -1340,7 +1387,6 @@ def all(self, *, skipna: bool = True, **kwargs):
         >>> pd.array([1, 0, pd.NA]).all(skipna=False)
         False
         """
-        kwargs.pop("axis", None)
         nv.validate_all((), kwargs)
 
         values = self._data.copy()
@@ -1350,7 +1396,7 @@ def all(self, *, skipna: bool = True, **kwargs):
         # bool, int, float, complex, str, bytes,
         # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
         np.putmask(values, self._mask, self._truthy_value)  # type: ignore[arg-type]
-        result = values.all()
+        result = values.all(axis=axis)
 
         if skipna:
             return result

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10852,7 +10852,7 @@ def blk_func(values, axis: Axis = 1):
                     self._mgr, ArrayManager
                 ):
                     return values._reduce(name, axis=1, skipna=skipna, **kwds)
-                return values._reduce(name, skipna=skipna, **kwds)
+                return values._reduce_and_wrap(name, skipna=skipna, kwargs=kwds)
             else:
                 return op(values, axis=axis, skipna=skipna, **kwds)
 
@@ -10897,7 +10897,7 @@ def _get_data() -> DataFrame:
             out = out.astype(out_dtype)
         elif (df._mgr.get_dtypes() == object).any():
             out = out.astype(object)
-        elif len(self) == 0 and name in ("sum", "prod"):
+        elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"):
             # Even if we are object dtype, follow numpy and return
             #  float64, see test_apply_funcs_over_empty
             out = out.astype(np.float64)
@@ -11158,10 +11158,9 @@ def idxmin(
         )
         indices = res._values
 
-        # indices will always be np.ndarray since axis is not None and
+        # indices will always be 1d array since axis is not None and
         # values is a 2d array for DataFrame
-        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
-        assert isinstance(indices, np.ndarray)  # for mypy
+        assert isinstance(indices, (np.ndarray, ExtensionArray))  # for mypy
 
         index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
@@ -11188,10 +11187,9 @@ def idxmax(
         )
         indices = res._values
 
-        # indices will always be np.ndarray since axis is not None and
+        # indices will always be 1d array since axis is not None and
         # values is a 2d array for DataFrame
-        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
-        assert isinstance(indices, np.ndarray)  # for mypy
+        assert isinstance(indices, (np.ndarray, ExtensionArray))  # for mypy
 
         index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]