Skip to content

Commit

Permalink
BUG: DataFrame reductions dtypes on object input (#51335)
Browse files Browse the repository at this point in the history
* BUG: DataFrame reductions dtypes

* whatsnew

* dtype fixup; whatsnew

* Add test, fix whatsnew

* Add datetime test

* result_dtype.type

* xfail

* type-ignore
  • Loading branch information
rhshadrach authored Feb 18, 2023
1 parent 1beec62 commit b836a88
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 72 deletions.
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,7 @@ Other API changes
- The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`)
- :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`)
- The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`)
- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`)
- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
-

Expand Down Expand Up @@ -1226,11 +1226,11 @@ Numeric
^^^^^^^
- Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`)
- Bug in arithmetic operations on :class:`Series` not propagating mask when combining masked dtypes and numpy dtypes (:issue:`45810`, :issue:`42630`)
- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` where the result would not be coerced to float (:issue:`49551`)
- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`)
- Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`)
- Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there was ``NA`` values (:issue:`50982`)
- Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`)

Conversion
^^^^^^^^^^
Expand Down
8 changes: 8 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,14 @@ def ordered(request):
return request.param


@pytest.fixture(params=[True, False])
def skipna(request):
    """Parametrized fixture supplying the boolean ``skipna`` argument (True, then False)."""
    return request.param


@pytest.fixture(params=["first", "last", False])
def keep(request):
"""
Expand Down
83 changes: 36 additions & 47 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_object_dtype,
is_scalar,
is_sequence,
needs_i8_conversion,
Expand Down Expand Up @@ -10458,54 +10457,44 @@ def _get_data() -> DataFrame:
data = self._get_bool_data()
return data

if numeric_only or axis == 0:
# For numeric_only non-None and axis non-None, we know
# which blocks to use and no try/except is needed.
# For numeric_only=None only the case with axis==0 and no object
# dtypes are unambiguous can be handled with BlockManager.reduce
# Case with EAs see GH#35881
df = self
if numeric_only:
df = _get_data()
if axis == 1:
df = df.T
axis = 0

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

return out

assert not numeric_only and axis in (1, None)

data = self
values = data.values
result = func(values)

if hasattr(result, "dtype"):
if filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
elif filter_type is None and is_object_dtype(result.dtype):
try:
result = result.astype(np.float64)
except (ValueError, TypeError):
# try to coerce to the original dtypes item by item if we can
pass

# Case with EAs see GH#35881
df = self
if numeric_only:
df = _get_data()
if axis is None:
return result
return func(df.values)
elif axis == 1:
if len(df.index) == 0:
# Taking a transpose would result in no columns, losing the dtype.
# In the empty case, reducing along axis 0 or 1 gives the same
# result dtype, so reduce with axis=0 and ignore values
result = df._reduce(
op,
name,
axis=0,
skipna=skipna,
numeric_only=False,
filter_type=filter_type,
**kwds,
).iloc[:0]
result.index = df.index
return result
df = df.T

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
elif (df._mgr.get_dtypes() == object).any():
out = out.astype(object)
elif len(self) == 0 and name in ("sum", "prod"):
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

labels = self._get_agg_axis(axis)
result = self._constructor_sliced(result, index=labels)
return result
return out

def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
"""
Expand Down
12 changes: 4 additions & 8 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,14 +984,10 @@ def reduce(self: T, func: Callable) -> T:
# TODO NaT doesn't preserve dtype, so we need to ensure to create
# a timedelta result array if original was timedelta
# what if datetime results in timedelta? (eg std)
if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
else:
# error: Argument 1 to "append" of "list" has incompatible type
# "ExtensionArray"; expected "ndarray"
result_arrays.append(
sanitize_array([res], None) # type: ignore[arg-type]
)
dtype = arr.dtype if res is NaT else None
result_arrays.append(
sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type]
)

index = Index._simple_new(np.array([None], dtype=object)) # placeholder
columns = self.items
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1535,7 +1535,12 @@ def _maybe_null_out(
result[null_mask] = None
elif result is not NaT:
if check_below_min_count(shape, mask, min_count):
result = np.nan
result_dtype = getattr(result, "dtype", None)
if is_float_dtype(result_dtype):
# error: Item "None" of "Optional[Any]" has no attribute "type"
result = result_dtype.type("nan") # type: ignore[union-attr]
else:
result = np.nan

return result

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ def test_apply_funcs_over_empty(func):

result = df.apply(getattr(np, func))
expected = getattr(df, func)()
if func in ("sum", "prod"):
expected = expected.astype(float)
tm.assert_series_equal(result, expected)


Expand Down
74 changes: 68 additions & 6 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,11 @@ def wrapper(x):
DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
],
)
def test_stat_operators_attempt_obj_array(self, method, df):
def test_stat_operators_attempt_obj_array(self, method, df, axis):
# GH#676
assert df.values.dtype == np.object_
result = getattr(df, method)(1)
expected = getattr(df.astype("f8"), method)(1)
result = getattr(df, method)(axis=axis)
expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
Expand Down Expand Up @@ -424,7 +424,7 @@ def test_mean_mixed_string_decimal(self):
with pytest.raises(TypeError, match="unsupported operand type"):
df.mean()
result = df[["A", "C"]].mean()
expected = Series([2.7, 681.6], index=["A", "C"])
expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
tm.assert_series_equal(result, expected)

def test_var_std(self, datetime_frame):
Expand Down Expand Up @@ -687,6 +687,29 @@ def test_std_timedelta64_skipna_false(self):
expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
    "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
)
def test_std_datetime64_with_nat(
    self, values, skipna, using_array_manager, request
):
    # GH#51335: std of datetime64 data containing NaT must return
    # timedelta64[ns], with NaT when skipna=False or everything is NaT.
    all_nat = all(value is pd.NaT for value in values)
    if using_array_manager and (not skipna or all_nat):
        request.node.add_marker(
            pytest.mark.xfail(
                reason="GH#51446: Incorrect type inference on NaT in reduction result"
            )
        )
    df = DataFrame({"a": to_datetime(values)})
    result = df.std(skipna=skipna)
    if not skipna or all_nat:
        expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]")
    else:
        # 86400000000000ns == 1 day
        expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]")
    tm.assert_series_equal(result, expected)

def test_sum_corner(self):
empty_frame = DataFrame()

Expand All @@ -697,6 +720,29 @@ def test_sum_corner(self):
assert len(axis0) == 0
assert len(axis1) == 0

@pytest.mark.parametrize(
    "index",
    [
        tm.makeRangeIndex(0),
        tm.makeDateIndex(0),
        tm.makeNumericIndex(0, dtype=int),
        tm.makeNumericIndex(0, dtype=float),
        tm.makeDateIndex(0, freq="M"),
        tm.makePeriodIndex(0),
    ],
)
def test_axis_1_empty(self, all_reductions, index, using_array_manager):
    # Reducing an empty frame along axis=1 keeps the row index and yields
    # the dtype mandated by the reduction (bool/int64), object otherwise.
    df = DataFrame(columns=["a"], index=index)
    result = getattr(df, all_reductions)(axis=1)
    dtype_by_op = {"any": "bool", "all": "bool", "count": "int64"}
    expected_dtype = dtype_by_op.get(all_reductions, "object")
    expected = Series([], index=index, dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
@pytest.mark.parametrize("numeric_only", [None, True, False])
def test_sum_prod_nanops(self, method, unit, numeric_only):
Expand Down Expand Up @@ -1418,6 +1464,21 @@ def test_preserve_timezone(self, initial: str, method):
result = getattr(df, method)(axis=1)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
# GH#51242
val = to_datetime("1900-01-01", utc=True)
df = DataFrame(
{"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
)
op = getattr(df, method)
result = op(axis=1, skipna=skipna)
if skipna:
expected = Series([pd.NaT, val, val])
else:
expected = Series([pd.NaT, pd.NaT, val])
tm.assert_series_equal(result, expected)

def test_frame_any_with_timedelta(self):
# GH#17667
df = DataFrame(
Expand Down Expand Up @@ -1609,12 +1670,13 @@ def test_prod_sum_min_count_mixed_object():


@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
def test_reduction_axis_none_returns_scalar(method):
@pytest.mark.parametrize("numeric_only", [True, False])
def test_reduction_axis_none_returns_scalar(method, numeric_only):
# GH#21597 As of 2.0, axis=None reduces over all axes.

df = DataFrame(np.random.randn(4, 4))

result = getattr(df, method)(axis=None)
result = getattr(df, method)(axis=None, numeric_only=numeric_only)
np_arr = df.to_numpy()
if method in {"skew", "kurt"}:
comp_mod = pytest.importorskip("scipy.stats")
Expand Down
8 changes: 0 additions & 8 deletions pandas/tests/test_nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,6 @@
use_bn = nanops._USE_BOTTLENECK


# NOTE(review): this local fixture is removed by this commit in favor of the
# shared ``skipna`` fixture added to pandas/conftest.py.
@pytest.fixture(params=[True, False])
def skipna(request):
    """
    Fixture to pass skipna to nanops functions.
    """
    return request.param


@pytest.fixture
def disable_bottleneck(monkeypatch):
with monkeypatch.context() as m:
Expand Down

0 comments on commit b836a88

Please sign in to comment.