From 1eec9652e6953e9e4880a314fa6c9fbe0ac6650f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Dec 2019 06:43:26 -0600 Subject: [PATCH 01/38] API: Uses pd.NA in IntegerArray --- pandas/core/arrays/integer.py | 24 +++++++++++++++--------- pandas/tests/arrays/test_integer.py | 19 +++++++++++++------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 12b76df9a5983..6767ca9398b0f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -4,7 +4,7 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -43,7 +43,7 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = np.nan + na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" @@ -377,14 +377,19 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self): + def _coerce_to_ndarray(self, dtype=None): """ coerce to an ndarary of object dtype """ - # TODO(jreback) make this better data = self._data.astype(object) - data[self._mask] = self._na_value + + if dtype is not None and is_float_dtype(dtype): + na_value = np.nan + else: + na_value = self._na_value + + data[self._mask] = na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -394,7 +399,7 @@ def __array__(self, dtype=None): the array interface, return my values We return an object array here to preserve our scalar values """ - return self._coerce_to_ndarray() + return self._coerce_to_ndarray(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -510,7 +515,7 @@ def isna(self): @property def _na_value(self): - return np.nan + return self.dtype.na_value @classmethod def _concat_same_type(cls, to_concat): @@ -549,7 +554,7 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray() + data = self._coerce_to_ndarray(dtype=dtype) return astype_nansafe(data, dtype, copy=None) @property @@ -673,7 +678,8 @@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") - data[mask] = self._na_value + # We explicitly use NaN within reductions. + data[mask] = np.nan op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 443a0c7e71616..d8ca06d83a50d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -108,13 +108,17 @@ def test_repr_array_long(): class TestConstructors: + def test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) + result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype)) tm.assert_series_equal(result, expected) # from int / list @@ -464,7 +468,8 @@ def test_astype(self, all_data): # coerce to same numpy_dtype - mixed s = pd.Series(mixed) - with pytest.raises(ValueError): + with pytest.raises(TypeError): + # XXX: Should this be TypeError or ValueError? s.astype(all_data.dtype.numpy_dtype) # coerce to object @@ -507,7 +512,7 @@ def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NaN\n1 1" + expected = " A\n0 NA\n1 1" assert result == expected @@ -523,7 +528,7 @@ def test_conversions(data_missing): # we assert that we are exactly equal # including type conversions of scalars result = df["A"].astype("object").values - expected = np.array([np.nan, 1], dtype=object) + expected = np.array([pd.NA, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) for r, e in zip(result, expected): @@ -750,9 +755,11 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert float NaN to integer" + # XXX: determine the proper exception here, from int(NA). + # msg = "cannot convert float NaN to integer" + msg = "" - with pytest.raises(ValueError, match=msg): + with pytest.raises(TypeError, match=msg): arr.astype("uint32") From c5695629cf6cbbdffee7d078b4688f5ceb9e8be1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Dec 2019 16:09:43 -0600 Subject: [PATCH 02/38] wip --- pandas/core/arrays/integer.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 6767ca9398b0f..124f9166a7e2b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,5 @@ import numbers -from typing import Type +from typing import Any, Tuple, Type import warnings import numpy as np @@ -377,18 +377,19 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self, dtype=None): + def _coerce_to_ndarray(self, dtype=None, na_value=libmissing.NA): """ coerce to an ndarary of object dtype """ # TODO(jreback) make this better - data = self._data.astype(object) - - if dtype is not None and is_float_dtype(dtype): + if dtype is None: + dtype = object + elif is_float_dtype(dtype) and na_value is libmissing.NA: + # XXX: Do we want to implicitly treat NA as NaN here? + # We should be deliberate in this decision. na_value = np.nan - else: - na_value = self._na_value + data = self._data.astype(dtype) data[self._mask] = na_value return data @@ -615,6 +616,9 @@ def value_counts(self, dropna=True): return Series(array, index=index) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + return self._coerce_to_ndarray(na_value=np.nan), np.nan + def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. From a8261a48dffae253b930582deee9b331a224495f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 07:46:09 -0600 Subject: [PATCH 03/38] wip --- pandas/core/arrays/integer.py | 10 +++++++++- pandas/tests/extension/test_integer.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 124f9166a7e2b..b0e6d4daa2a7e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -390,7 +390,15 @@ def _coerce_to_ndarray(self, dtype=None, na_value=libmissing.NA): na_value = np.nan data = self._data.astype(dtype) - data[self._mask] = na_value + + if ( + is_integer_dtype(dtype) + and na_value is libmissing.NA + and not self._mask.any() + ): + return data + else: + data[self._mask] = na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..5ad060bb63716 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -129,7 +129,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.astype(float) + expected = expected.fillna(np.nan).astype(float) if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) From cddc9df852bd8b62afa8f319fbf90f45e5438a0e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 09:14:11 -0600 Subject: [PATCH 04/38] fixup value counts --- pandas/core/arrays/integer.py | 4 +++- pandas/tests/extension/test_integer.py | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b0e6d4daa2a7e..22a6cdaef1d78 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -618,7 +618,9 @@ def value_counts(self, dropna=True): # w/o passing the dtype array = np.append(array, [self._mask.sum()]) index = Index( - np.concatenate([index.values, np.array([np.nan], dtype=object)]), + np.concatenate( + [index.values, np.array([self.dtype.na_value], dtype=object)] + ), dtype=object, ) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 5ad060bb63716..7900649675610 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -34,7 +34,7 @@ def make_data(): - return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] @pytest.fixture( @@ -65,7 +65,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return integer_array([np.nan, 1], dtype=dtype) + return integer_array([pd.NA, 1], dtype=dtype) @pytest.fixture @@ -75,18 +75,18 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return integer_array([1, np.nan, 0], dtype=dtype) + return integer_array([1, pd.NA, 0], dtype=dtype) @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -94,7 +94,7 @@ def data_for_grouping(dtype): b = 1 a = 0 c = 2 - na = np.nan + na = pd.NA return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -142,6 +142,11 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass + if op_name == "__rpow__": + # TODO: https://github.com/pandas-dev/pandas/issues/29997 + # pow(1, NA) is NA or 1? + pytest.skip("TODO-29997") + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 @@ -163,6 +168,7 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): def check_opname(self, s, op_name, other, exc=None): + pytest.skip(msg="TODO: NA comparisions") super().check_opname(s, op_name, other, exc=None) def _compare_other(self, s, data, op_name, other): From 9488d3407ef43a1c5b65ac207b38b425dc188d7f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 10:57:34 -0600 Subject: [PATCH 05/38] fixed to_numpy --- pandas/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index d515a015cdbec..346fe3081801a 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1352,7 +1352,7 @@ def test_array_multiindex_raises(): ), ( pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object), + np.array([0, pd.NA], dtype=object), ), ( pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), From 0d5aab841bdc6c3954656200e50b3996e413aed4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 11:04:25 -0600 Subject: [PATCH 06/38] doc --- doc/source/user_guide/integer_na.rst | 25 ++++++++++++++++++++++++- pandas/core/arrays/integer.py | 11 +++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 77568f3bcb244..f08cbcf0dc698 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,7 +15,6 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. - In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. Because ``NaN`` is a float, this forces an array of integers with any missing values to become floating point. In some cases, this may not matter @@ -23,6 +22,9 @@ much. But if your integer column is, say, an identifier, casting to float can be problematic. Some integers cannot even be represented as floating point numbers. +Construction +------------ + Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` implemented within pandas. @@ -39,6 +41,12 @@ NumPy's ``'int64'`` dtype: pd.array([1, 2, np.nan], dtype="Int64") +All NA-like values are replaced with :attr:`pandas.NA`. + +.. ipython:: python + + pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64") + This array can be stored in a :class:`DataFrame` or :class:`Series` like any NumPy array. @@ -78,6 +86,9 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +Operations +---------- + Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another dtype if needed. @@ -123,3 +134,15 @@ Reduction and groupby operations such as 'sum' work as well. df.sum() df.groupby('B').A.sum() + +Scalar NA Value +--------------- + +:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar +missing value. Slicing a single element that's missing will return +:attr:`pandas.NA` + +.. ipython:: python + + a = pd.array([1, None], dtype="Int64") + a[1] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 22a6cdaef1d78..58d9d0c33908a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -267,6 +267,11 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): .. versionadded:: 0.24.0 + .. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, rather + than :attr:`numpy.nan`. + .. warning:: IntegerArray is currently experimental, and its API or internal @@ -804,6 +809,12 @@ def integer_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + + Attributes ---------- None From fa61a6dfdff568ab148663f375f313b29aaf020e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 13:02:06 -0600 Subject: [PATCH 07/38] wip --- pandas/core/arrays/boolean.py | 5 ++++- pandas/core/arrays/integer.py | 9 ++++----- pandas/tests/arrays/test_integer.py | 20 +++++++++++++++----- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c118b6fe26549..ea2147f91a0f8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -554,8 +554,11 @@ def _values_for_argsort(self) -> np.ndarray: @classmethod def _create_logical_method(cls, op): def logical_method(self, other): + from pandas.arrays import IntegerArray - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): # Rely on pandas to unbox and dispatch to us. return NotImplemented diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 58d9d0c33908a..05c6398398582 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -653,13 +653,13 @@ def _values_for_argsort(self) -> np.ndarray: @classmethod def _create_comparison_method(cls, op): - op_name = op.__name__ - @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): + from pandas.arrays import BooleanArray + mask = None - if isinstance(other, IntegerArray): + if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask elif is_list_like(other): @@ -684,8 +684,7 @@ def cmp_method(self, other): else: mask = self._mask | mask - result[mask] = op_name == "ne" - return result + return BooleanArray(result, mask) name = "__{name}__".format(name=op.__name__) return set_function_name(cmp_method, name, cls) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index d8ca06d83a50d..a26e79d43a90d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -365,10 +365,10 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -376,11 +376,10 @@ def _compare_other(self, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data._data) - expected = op(expected, other) + expected = op(pd.Series(data._data), other) # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -393,6 +392,17 @@ def test_compare_array(self, data, all_compare_operators): other = pd.Series([0] * len(data)) self._compare_other(data, op_name, other) + def test_compare_boolean_array(self): + left = pd.array([0, 1, None, None], dtype="Int64") + right = pd.array([True, True, False, None], dtype="boolean") + expected = pd.array([False, True, None, None], dtype="boolean") + + result = left == right + tm.assert_extension_array_equal(result, expected) + + result = right == left + tm.assert_extension_array_equal(result, expected) + class TestCasting: @pytest.mark.parametrize("dropna", [True, False]) From de2c6c6b789defbdbfaaa817aae8bb449b409276 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 13:21:42 -0600 Subject: [PATCH 08/38] wip --- pandas/core/arrays/boolean.py | 11 +++++------ pandas/tests/arrays/test_integer.py | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index ea2147f91a0f8..1e7cb479f02f9 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -554,11 +554,7 @@ def _values_for_argsort(self) -> np.ndarray: @classmethod def _create_logical_method(cls, op): def logical_method(self, other): - from pandas.arrays import IntegerArray - - if isinstance( - other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) - ): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented @@ -600,8 +596,11 @@ def _create_comparison_method(cls, op): op_name = op.__name__ def cmp_method(self, other): + from pandas.arrays import IntegerArray - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): # Rely on pandas to unbox and dispatch to us. return NotImplemented diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index a26e79d43a90d..430e4c049a7e7 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -225,7 +225,7 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): original = expected expected = expected.astype(s.dtype) - expected[mask] = np.nan + expected[mask] = pd.NA # assert that the expected astype is ok # (skip for unsigned as they have wrap around) @@ -380,6 +380,7 @@ def _compare_other(self, data, op_name, other): # fill the nan locations expected[data._mask] = pd.NA + expected = expected.astype("boolean") tm.assert_series_equal(result, expected) From 60d7663c3c2c5b2819445ca368647fe69c36f0da Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 13:46:37 -0600 Subject: [PATCH 09/38] wip --- pandas/core/arrays/boolean.py | 11 +++++++++++ pandas/tests/arrays/test_integer.py | 21 +++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 1e7cb479f02f9..15fc1f7619548 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -6,6 +6,7 @@ from pandas._libs import lib from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe @@ -665,6 +666,16 @@ def _reduce(self, name, skipna=True, **kwargs): return result + def any(self, axis=None, out=None, keepdims=False, skipna=True): + # Note: needed to implement for + # pandas/tests/arrays/test_integer.py::test_preserve_dtypes[sum] + nv.validate_any((), dict(out=out, keepdims=keepdims)) + return self._reduce("any", skipna=skipna) + + def all(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_any((), dict(out=out, keepdims=keepdims)) + return self._reduce("all", skipna=skipna) + def _maybe_mask_result(self, result, mask, other, op_name): """ Parameters diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 430e4c049a7e7..33ab32a8486a5 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -189,6 +189,10 @@ def _check_op_float(self, result, expected, mask, s, op_name, other): # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) mask2 = np.isinf(expected) & np.isnan(result) expected[mask2] = np.nan + if op_name == "__pow__": + # https://github.com/pandas-dev/pandas/issues/29997 + # unclear what 1 ** NA is. + pytest.skip(msg="GH-29997") tm.assert_series_equal(result, expected) def _check_op_integer(self, result, expected, mask, s, op_name, other): @@ -212,16 +216,23 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): else: expected = expected.fillna(0) else: - expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 try: - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) except ValueError: expected = expected.astype(float) - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) @@ -343,6 +354,7 @@ def test_error(self, data, all_arithmetic_operators): with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) + @pytest.mark.xfail(reason="GH-29997") def test_pow(self): # https://github.com/pandas-dev/pandas/issues/22022 a = integer_array([1, np.nan, np.nan, 1]) @@ -353,6 +365,7 @@ def test_pow(self): def test_rpow_one_to_na(self): # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 arr = integer_array([np.nan, np.nan]) result = np.array([1.0, 2.0]) ** arr expected = np.array([1.0, np.nan]) @@ -702,7 +715,7 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, False]) + expected = pd.Series([False, True, None], dtype="boolean") tm.assert_series_equal(result, expected) result = df.A + df.B From a4c4618cadbbd975d25e4dbc16a51afa8154967b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 3 Dec 2019 14:00:04 -0600 Subject: [PATCH 10/38] fixup extension --- pandas/tests/extension/test_integer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 7900649675610..8d3c94a3be937 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -142,7 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass - if op_name == "__rpow__": + if op_name in {"__pow__", "__rpow__"}: # TODO: https://github.com/pandas-dev/pandas/issues/29997 # pow(1, NA) is NA or 1? pytest.skip("TODO-29997") @@ -168,12 +168,20 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): def check_opname(self, s, op_name, other, exc=None): - pytest.skip(msg="TODO: NA comparisions") super().check_opname(s, op_name, other, exc=None) def _compare_other(self, s, data, op_name, other): self.check_opname(s, op_name, other) + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + class TestInterface(base.BaseInterfaceTests): pass From 1c716f3c59db6094d185dcb1f9600d324c78f98a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Dec 2019 07:49:17 -0600 Subject: [PATCH 11/38] update tests --- pandas/tests/arrays/test_integer.py | 46 ++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 33ab32a8486a5..1ddee22b4313a 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -397,14 +397,44 @@ def _compare_other(self, data, op_name, other): tm.assert_series_equal(result, expected) - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, 0) - - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.Series([0] * len(data)) - self._compare_other(data, op_name, other) + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="Int64") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) def test_compare_boolean_array(self): left = pd.array([0, 1, None, None], dtype="Int64") From 34de18e76616513b0499867fb556f13d95f43a2b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Dec 2019 14:59:37 -0600 Subject: [PATCH 12/38] updates --- pandas/core/arrays/integer.py | 26 ++++++++++++++++---------- pandas/tests/arrays/test_integer.py | 16 ++++++++++++++-- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 05c6398398582..818168a0a9c6c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -671,16 +671,22 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask @@ -747,6 +753,7 @@ def _create_arithmetic_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def integer_arithmetic_method(self, other): + # nans propagate mask = None @@ -771,15 +778,14 @@ def integer_arithmetic_method(self, other): if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with numeric values") - # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask # 1 ** np.nan is 1. So we have to unmask those. if op_name == "pow": - mask = np.where(self == 1, False, mask) + mask = np.where(self._data == 1, False, mask) elif op_name == "rpow": mask = np.where(other == 1, False, mask) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 1ddee22b4313a..ce3197ce071fc 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -160,7 +160,7 @@ def _check_op(self, s, op_name, other, exc=None): # 1 ** na is na, so need to unmask those if op_name == "__pow__": - mask = np.where(s == 1, False, mask) + mask = np.where(s.to_numpy() == 1, False, mask) elif op_name == "__rpow__": mask = np.where(other == 1, False, mask) @@ -265,11 +265,16 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): rhs = pd.Series([1] * len(data), dtype=data.dtype) rhs.iloc[-1] = np.nan + if op in {"__pow__", "__rpow__"}: + pytest.skip("TODO") + self._check_op(s, op, rhs) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators + if op in {"__pow__", "__rpow__"}: + pytest.skip("TODO") s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) @@ -277,6 +282,8 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators + if op in {"__pow__", "__rpow__"}: + pytest.skip("TODO") df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) @@ -284,6 +291,8 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators + if op in {"__pow__", "__rpow__"}: + pytest.skip("TODO") s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) @@ -292,6 +301,9 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): def test_arith_coerce_scalar(self, data, all_arithmetic_operators): op = all_arithmetic_operators + if op in {"__pow__", "__rpow__"}: + pytest.skip("TODO") + s = pd.Series(data) other = 0.01 @@ -405,7 +417,7 @@ def test_scalar(self, other, all_compare_operators): result = op(a, other) if other is pd.NA: - expected = pd.array([None, None, None], dtype="Int64") + expected = pd.array([None, None, None], dtype="boolean") else: values = op(a._data, other) expected = pd.arrays.BooleanArray(values, a._mask, copy=True) From ffbe2998f96a17b3432f05151864294b86196723 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 5 Dec 2019 15:48:15 -0600 Subject: [PATCH 13/38] wip --- pandas/core/arrays/integer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 818168a0a9c6c..32d3b1db732e1 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -394,16 +394,16 @@ def _coerce_to_ndarray(self, dtype=None, na_value=libmissing.NA): # We should be deliberate in this decision. na_value = np.nan - data = self._data.astype(dtype) + if is_integer_dtype(dtype): + if not self.isna().any(): + return self._data + else: + raise ValueError( + "cannot convert to integer NumPy array with missing values" + ) - if ( - is_integer_dtype(dtype) - and na_value is libmissing.NA - and not self._mask.any() - ): - return data - else: - data[self._mask] = na_value + data = self._data.astype(dtype) + data[self._mask] = na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us From 7abf40e53796ce52da0b974de2a59690bf3492ef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 5 Dec 2019 16:14:31 -0600 Subject: [PATCH 14/38] API: Handle pow & rpow special cases Closes https://github.com/pandas-dev/pandas/issues/29997 --- doc/source/reference/arrays.rst | 30 ++++++++++++++++++++++++--- pandas/_libs/missing.pyx | 25 ++++++++++++++++++++-- pandas/tests/scalar/test_na_scalar.py | 24 ++++++++++++++++++++- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index cf14d28772f4c..563d6f5bab833 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -2,9 +2,9 @@ .. _api.arrays: -============= -Pandas arrays -============= +========================= +Pandas arrays and scalars +========================= .. currentmodule:: pandas @@ -28,6 +28,30 @@ Strings :class:`StringDtype` :class:`str` :ref:`api.array Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= +As the table shows, each extension type is associated with an array class. Pandas may define +a dedicated scalar for the type (for example, :class:`arrays.IntervalArray` uses :class:`Interval`) +or it may re-use Python's scalars (for example, :class:`StringArray` uses Python's :class:`str`). + +Additionally, pandas defines a singleton scalar missing value :class:`pandas.NA`. This +value is distinct from ``float('nan')``, :attr:`numpy.nan` and Python's :class:`None`. + +.. autosummary:: + :toctree: api/ + + NA + +In binary operations, :class:`NA` is treated as numeric. Generally, ``NA`` propagates, so +the result of ``op(NA, other)`` will be ``NA``. There are a few special cases when the +result is known, even when one of the operands is ``NA``. + +* ``pd.NA ** 0`` is always 0. +* ``1 ** pd.NA`` is always 1. + +In logical operations, :class:`NA` uses Kleene logic. + +Creating Arrays +--------------- + Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). The top-level :meth:`array` method can be used to create a new array, which may be stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFrame`. diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 30832a8e4daab..d0ead37806ae7 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -365,8 +365,6 @@ class NAType(C_NAType): __rmod__ = _create_binary_propagating_op("__rmod__") __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True) __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True) - __pow__ = _create_binary_propagating_op("__pow__") - __rpow__ = _create_binary_propagating_op("__rpow__") # __lshift__ and __rshift__ are not implemented __eq__ = _create_binary_propagating_op("__eq__") @@ -383,6 +381,29 @@ class NAType(C_NAType): __abs__ = _create_unary_propagating_op("__abs__") __invert__ = _create_unary_propagating_op("__invert__") + # pow has special + def __pow__(self, other): + if other is C_NA: + return NA + elif isinstance(other, (numbers.Number, np.bool_)): + if other == 0: + return other + else: + return NA + + return NotImplemented + + def __rpow__(self, other): + if other is C_NA: + return NA + elif isinstance(other, (numbers.Number, np.bool_)): + if other == 1: + return other + else: + return NA + + return NotImplemented + # Logical ops using Kleene logic def __and__(self, other): diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 586433698a587..3c29fe38b704e 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -38,11 +38,14 @@ def test_arithmetic_ops(all_arithmetic_functions): op = all_arithmetic_functions for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: - if op.__name__ == "rmod" and isinstance(other, str): + if op.__name__ in ("pow", "rpow", "rmod") and isinstance(other, str): continue if op.__name__ in ("divmod", "rdivmod"): assert op(NA, other) is (NA, NA) else: + if op.__name__ == "rpow": + # avoid special case + other += 1 assert op(NA, other) is NA @@ -69,6 +72,25 @@ def test_comparison_ops(): assert (other <= NA) is NA +@pytest.mark.parametrize( + "value", [0, 0.0, False, np.bool_(False), np.int_(0), np.float_(0)] +) +def test_pow_special(value): + result = pd.NA ** value + assert isinstance(result, type(value)) + assert result == 0 + + +@pytest.mark.parametrize( + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)] +) +def test_rpow_special(value): + result = value ** pd.NA + assert result == 1 + if not isinstance(value, (np.float_, np.bool_, np.int_)): + assert isinstance(result, type(value)) + + def test_unary_ops(): assert +NA is NA assert -NA is NA From 36d403dbbfd65c34dab46a719d8205579a1348c4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 6 Dec 2019 06:18:02 -0600 Subject: [PATCH 15/38] move --- doc/source/reference/arrays.rst | 9 --------- doc/source/user_guide/missing_data.rst | 6 ++++++ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 563d6f5bab833..b435aba7599dd 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -40,15 +40,6 @@ value is distinct from ``float('nan')``, :attr:`numpy.nan` and Python's :class:` NA -In binary operations, :class:`NA` is treated as numeric. Generally, ``NA`` propagates, so -the result of ``op(NA, other)`` will be ``NA``. There are a few special cases when the -result is known, even when one of the operands is ``NA``. - -* ``pd.NA ** 0`` is always 0. -* ``1 ** pd.NA`` is always 1. - -In logical operations, :class:`NA` uses Kleene logic. - Creating Arrays --------------- diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 1cc485a229123..8b4a3cbed10ff 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -832,6 +832,12 @@ return ``False``. pd.NA == pd.NA pd.NA < 2.5 +There are a few special cases when the result is known, even when one of the +operands is ``NA``. + +* ``pd.NA ** 0`` is always 0. +* ``1 ** pd.NA`` is always 1. + To check if a value is equal to ``pd.NA``, the :func:`isna` function can be used: From 945e8cd9fcd66d268519534b8c62d0f475db22e0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 6 Dec 2019 06:18:26 -0600 Subject: [PATCH 16/38] revert --- doc/source/reference/arrays.rst | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index b435aba7599dd..cf14d28772f4c 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -2,9 +2,9 @@ .. _api.arrays: -========================= -Pandas arrays and scalars -========================= +============= +Pandas arrays +============= .. currentmodule:: pandas @@ -28,21 +28,6 @@ Strings :class:`StringDtype` :class:`str` :ref:`api.array Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= -As the table shows, each extension type is associated with an array class. Pandas may define -a dedicated scalar for the type (for example, :class:`arrays.IntervalArray` uses :class:`Interval`) -or it may re-use Python's scalars (for example, :class:`StringArray` uses Python's :class:`str`). - -Additionally, pandas defines a singleton scalar missing value :class:`pandas.NA`. This -value is distinct from ``float('nan')``, :attr:`numpy.nan` and Python's :class:`None`. - -.. autosummary:: - :toctree: api/ - - NA - -Creating Arrays ---------------- - Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). The top-level :meth:`array` method can be used to create a new array, which may be stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFrame`. From 8fc8b3a7b7e60baca4508ee0f47472df5c2f4fa3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 6 Dec 2019 08:59:35 -0600 Subject: [PATCH 17/38] fixup --- doc/source/user_guide/missing_data.rst | 12 ++++++------ pandas/_libs/missing.pyx | 2 +- pandas/tests/scalar/test_na_scalar.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 8b4a3cbed10ff..bbfb143f56b9d 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -822,6 +822,12 @@ For example, ``pd.NA`` propagates in arithmetic operations, similarly to pd.NA + 1 "a" * pd.NA +There are a few special cases when the result is known, even when one of the +operands is ``NA``. + +* ``pd.NA ** 0`` is always 0. +* ``1 ** pd.NA`` is always 1. + In equality and comparison operations, ``pd.NA`` also propagates. This deviates from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always return ``False``. @@ -832,12 +838,6 @@ return ``False``. pd.NA == pd.NA pd.NA < 2.5 -There are a few special cases when the result is known, even when one of the -operands is ``NA``. - -* ``pd.NA ** 0`` is always 0. -* ``1 ** pd.NA`` is always 1. - To check if a value is equal to ``pd.NA``, the :func:`isna` function can be used: diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index d0ead37806ae7..976c2a75b635c 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -387,7 +387,7 @@ class NAType(C_NAType): return NA elif isinstance(other, (numbers.Number, np.bool_)): if other == 0: - return other + return type(other)(1) else: return NA diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 3c29fe38b704e..058e2cd9d962c 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -78,7 +78,7 @@ def test_comparison_ops(): def test_pow_special(value): result = pd.NA ** value assert isinstance(result, type(value)) - assert result == 0 + assert result == 1 @pytest.mark.parametrize( From a49aa654440c2bf75e35e0b84befa78439922ff4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 6 Dec 2019 11:02:14 -0600 Subject: [PATCH 18/38] handle negative --- doc/source/user_guide/missing_data.rst | 10 +++++++-- pandas/_libs/missing.pyx | 3 ++- pandas/tests/scalar/test_na_scalar.py | 30 +++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index bbfb143f56b9d..1bfe196cb2f89 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -825,8 +825,14 @@ For example, ``pd.NA`` propagates in arithmetic operations, similarly to There are a few special cases when the result is known, even when one of the operands is ``NA``. -* ``pd.NA ** 0`` is always 0. -* ``1 ** pd.NA`` is always 1. + +================ ====== +Operation Result +================ ====== +``pd.NA ** 0`` 0 +``1 ** pd.NA`` 1 +``-1 ** pd.NA`` -1 +================ ====== In equality and comparison operations, ``pd.NA`` also propagates. This deviates from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 976c2a75b635c..63aa5501c5250 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -387,6 +387,7 @@ class NAType(C_NAType): return NA elif isinstance(other, (numbers.Number, np.bool_)): if other == 0: + # returning positive is correct for +/- 0. return type(other)(1) else: return NA @@ -397,7 +398,7 @@ class NAType(C_NAType): if other is C_NA: return NA elif isinstance(other, (numbers.Number, np.bool_)): - if other == 1: + if other == 1 or other == -1: return other else: return NA diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 058e2cd9d962c..40db617c64717 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -73,7 +73,19 @@ def test_comparison_ops(): @pytest.mark.parametrize( - "value", [0, 0.0, False, np.bool_(False), np.int_(0), np.float_(0)] + "value", + [ + 0, + 0.0, + -0, + -0.0, + False, + np.bool_(False), + np.int_(0), + np.float_(0), + np.int_(-0), + np.float_(-0), + ], ) def test_pow_special(value): result = pd.NA ** value @@ -82,11 +94,23 @@ def test_pow_special(value): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)] + "value", + [ + 1, + 1.0, + -1, + -1.0, + True, + np.bool_(True), + np.int_(1), + np.float_(1), + np.int_(-1), + np.float_(-1), + ], ) def test_rpow_special(value): result = value ** pd.NA - assert result == 1 + assert result == value if not isinstance(value, (np.float_, np.bool_, np.int_)): assert isinstance(result, type(value)) From 88fa4120745f744f3344efa37283d2a47712ae25 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 6 Dec 2019 13:28:35 -0600 Subject: [PATCH 19/38] expand test --- pandas/tests/arrays/test_integer.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index ce3197ce071fc..055d54495725c 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -448,15 +448,13 @@ def test_array(self, all_compare_operators): b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") ) - def test_compare_boolean_array(self): - left = pd.array([0, 1, None, None], dtype="Int64") - right = pd.array([True, True, False, None], dtype="boolean") - expected = pd.array([False, True, None, None], dtype="boolean") - - result = left == right - tm.assert_extension_array_equal(result, expected) - - result = right == left + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) tm.assert_extension_array_equal(result, expected) From 0902eef8ab46e2eb8388ea4ad93c2f967c4fcfe9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 6 Dec 2019 15:32:45 -0600 Subject: [PATCH 20/38] wip --- pandas/core/arrays/integer.py | 2 +- pandas/tests/arrays/test_integer.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 32d3b1db732e1..90e2c5fac0672 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -785,7 +785,7 @@ def integer_arithmetic_method(self, other): # 1 ** np.nan is 1. So we have to unmask those. if op_name == "pow": - mask = np.where(self._data == 1, False, mask) + mask = np.where((self._data == 1) & self._mask, False, mask) elif op_name == "rpow": mask = np.where(other == 1, False, mask) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 055d54495725c..22ca6d066aa45 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -189,10 +189,6 @@ def _check_op_float(self, result, expected, mask, s, op_name, other): # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) mask2 = np.isinf(expected) & np.isnan(result) expected[mask2] = np.nan - if op_name == "__pow__": - # https://github.com/pandas-dev/pandas/issues/29997 - # unclear what 1 ** NA is. - pytest.skip(msg="GH-29997") tm.assert_series_equal(result, expected) def _check_op_integer(self, result, expected, mask, s, op_name, other): From c6583076d06ea493e3f56839134b22e37c6341ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Dec 2019 09:47:17 -0600 Subject: [PATCH 21/38] fixup --- pandas/tests/base/test_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 8fa52af832907..4b6349a505509 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -315,7 +315,7 @@ def test_array_multiindex_raises(): ), ( pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object), + np.array([0, pd.NA], dtype=object), ), ( pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), From 4f9d7752c8ac1becab544636e86a5687848ffcb0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Dec 2019 10:02:40 -0600 Subject: [PATCH 22/38] exceptions --- pandas/tests/arrays/test_integer.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 22ca6d066aa45..5d9b1a7da1b8b 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -261,16 +261,16 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): rhs = pd.Series([1] * len(data), dtype=data.dtype) rhs.iloc[-1] = np.nan - if op in {"__pow__", "__rpow__"}: - pytest.skip("TODO") + # if op in {"__pow__", "__rpow__"}: + # pytest.skip("TODO") self._check_op(s, op, rhs) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - if op in {"__pow__", "__rpow__"}: - pytest.skip("TODO") + # if op in {"__pow__", "__rpow__"}: + # pytest.skip("TODO") s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) @@ -278,8 +278,8 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - if op in {"__pow__", "__rpow__"}: - pytest.skip("TODO") + # if op in {"__pow__", "__rpow__"}: + # pytest.skip("TODO") df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) @@ -287,8 +287,8 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - if op in {"__pow__", "__rpow__"}: - pytest.skip("TODO") + # if op in {"__pow__", "__rpow__"}: + # pytest.skip("TODO") s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) @@ -297,8 +297,8 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): def test_arith_coerce_scalar(self, data, all_arithmetic_operators): op = all_arithmetic_operators - if op in {"__pow__", "__rpow__"}: - pytest.skip("TODO") + # if op in {"__pow__", "__rpow__"}: + # pytest.skip("TODO") s = pd.Series(data) @@ -528,8 +528,7 @@ def test_astype(self, all_data): # coerce to same numpy_dtype - mixed s = pd.Series(mixed) - with pytest.raises(TypeError): - # XXX: Should this be TypeError or ValueError? + with pytest.raises(ValueError): s.astype(all_data.dtype.numpy_dtype) # coerce to object @@ -815,11 +814,9 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - # XXX: determine the proper exception here, from int(NA). - # msg = "cannot convert float NaN to integer" - msg = "" + msg = "cannot convert to integer NumPy array with missing values" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): arr.astype("uint32") From 1244ef49df6bdfcbc33c1cf0400f2046dc7824ef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Dec 2019 14:02:23 -0600 Subject: [PATCH 23/38] wip --- pandas/core/arrays/integer.py | 36 +++++++++++----- pandas/tests/arrays/test_integer.py | 64 +++++++++++++++++------------ 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index cd6a077d13313..e77e58e3100e0 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -749,13 +749,13 @@ def _create_arithmetic_method(cls, op): def integer_arithmetic_method(self, other): # nans propagate - mask = None + omask = None if getattr(other, "ndim", 0) > 1: raise NotImplementedError("can only perform ops with 1-d structures") if isinstance(other, IntegerArray): - other, mask = other._data, other._mask + other, omask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) @@ -769,23 +769,39 @@ def integer_arithmetic_method(self, other): raise TypeError("can only perform ops with numeric values") else: - if not (is_float(other) or is_integer(other)): + if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") - if mask is None: + if omask is None: mask = self._mask.copy() + if other is libmissing.NA: + mask |= True else: - mask = self._mask | mask + mask = self._mask | omask - # 1 ** np.nan is 1. So we have to unmask those. if op_name == "pow": - mask = np.where((self._data == 1) & self._mask, False, mask) + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) elif op_name == "rpow": - mask = np.where(other == 1, False, mask) + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. + mask = np.where((self._data == 0) & ~self._mask, False, mask) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 5d9b1a7da1b8b..9f7730bc18c1a 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -139,8 +139,13 @@ def _check_divmod_op(self, s, op, other, exc=None): def _check_op(self, s, op_name, other, exc=None): op = self.get_op_from_name(op_name) + # XXX: On master, this was mutating `s` inplace for rtrudiv. + # The 0 was being turned into a NaN, most likely via the mask. result = op(s, other) + if op_name == "__rtruediv__": + pytest.skip(msg="TODO: what's expected?") + # compute expected mask = s.isna() @@ -158,12 +163,8 @@ def _check_op(self, s, op_name, other, exc=None): if omask is not None: mask |= omask - # 1 ** na is na, so need to unmask those - if op_name == "__pow__": - mask = np.where(s.to_numpy() == 1, False, mask) - - elif op_name == "__rpow__": - mask = np.where(other == 1, False, mask) + if op_name in {"__pow__", "__rpow__"}: + pytest.skip("tested elsewhere") # float result type or float op if ( @@ -261,35 +262,23 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): rhs = pd.Series([1] * len(data), dtype=data.dtype) rhs.iloc[-1] = np.nan - # if op in {"__pow__", "__rpow__"}: - # pytest.skip("TODO") - self._check_op(s, op, rhs) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - # if op in {"__pow__", "__rpow__"}: - # pytest.skip("TODO") - s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - # if op in {"__pow__", "__rpow__"}: - # pytest.skip("TODO") - df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - # if op in {"__pow__", "__rpow__"}: - # pytest.skip("TODO") - s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) self._check_op(s, op, other, exc=TypeError) @@ -297,9 +286,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): def test_arith_coerce_scalar(self, data, all_arithmetic_operators): op = all_arithmetic_operators - # if op in {"__pow__", "__rpow__"}: - # pytest.skip("TODO") - s = pd.Series(data) other = 0.01 @@ -362,13 +348,39 @@ def test_error(self, data, all_arithmetic_operators): with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) - @pytest.mark.xfail(reason="GH-29997") - def test_pow(self): + def test_pow_scalar(self): + a = pd.array([0, 1, None, 2], dtype="Int64") + result = a ** 0 + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([0, 1, None, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + # reversed + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + def test_pow_array(self): # https://github.com/pandas-dev/pandas/issues/22022 - a = integer_array([1, np.nan, np.nan, 1]) - b = integer_array([1, np.nan, 1, np.nan]) + a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) + b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) result = a ** b - expected = pd.core.arrays.integer_array([1, np.nan, np.nan, 1]) + expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) tm.assert_extension_array_equal(result, expected) def test_rpow_one_to_na(self): From 5293d87fda191b8170fb3c54565b369587beb398 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Dec 2019 14:37:28 -0600 Subject: [PATCH 24/38] fixup --- pandas/core/arrays/integer.py | 2 +- pandas/tests/arrays/test_integer.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e77e58e3100e0..93b483827723f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -390,7 +390,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value=libmissing.NA): if is_integer_dtype(dtype): if not self.isna().any(): - return self._data + return self._data.astype(dtype) else: raise ValueError( "cannot convert to integer NumPy array with missing values" diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 9f7730bc18c1a..28029e37eb6d5 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -549,6 +549,17 @@ def test_astype(self, all_data): expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) + def test_astype_to_larger_numpy(self): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def test_astype_specific_casting(self, dtype): s = pd.Series([1, 2, 3], dtype="Int64") From 39f225a6a1ca389b1523a9a242054659f5b00e2f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Dec 2019 15:09:31 -0600 Subject: [PATCH 25/38] arrow --- pandas/tests/arrays/test_integer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 28029e37eb6d5..06e4b5279fff0 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -912,7 +912,9 @@ def test_arrow_array(data): import pyarrow as pa arr = pa.array(data) - expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) assert arr.equals(expected) From ea19b2d81ba2c49dcca8f72aea8c48033491f722 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Dec 2019 15:52:06 -0600 Subject: [PATCH 26/38] update --- pandas/core/arrays/integer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 93b483827723f..ed5515fa29088 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -829,7 +829,6 @@ def integer_arithmetic_method(self, other): Now uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`. - Attributes ---------- None From 68fe1551445fddaa9cdd38b5aac925781f75168b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Dec 2019 11:55:19 -0600 Subject: [PATCH 27/38] update --- pandas/core/arrays/integer.py | 16 +++++++++++----- pandas/tests/arrays/test_integer.py | 5 +++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b60f505d0b4d0..9d9dbf6b34392 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -377,19 +377,21 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self, dtype=None, na_value=libmissing.NA): + def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default): """ coerce to an ndarary of object dtype """ - # TODO(jreback) make this better if dtype is None: dtype = object - elif is_float_dtype(dtype) and na_value is libmissing.NA: - # XXX: Do we want to implicitly treat NA as NaN here? - # We should be deliberate in this decision. + + if is_float_dtype(dtype) and na_value is lib._no_default: na_value = np.nan + else: + na_value = libmissing.NA if is_integer_dtype(dtype): + # Specifically, a NumPy integer dtype, not a pandas integer dtype, + # since we're coercing to a numpy dtype by definition in this function. if not self.isna().any(): return self._data.astype(dtype) else: @@ -627,6 +629,8 @@ def value_counts(self, dropna=True): return Series(array, index=index) def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. return self._coerce_to_ndarray(na_value=np.nan), np.nan def _values_for_argsort(self) -> np.ndarray: @@ -670,6 +674,8 @@ def cmp_method(self, other): if other is libmissing.NA: # numpy does not handle pd.NA well as "other" scalar (it returns # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. result = np.zeros(self._data.shape, dtype="bool") mask = np.ones(self._data.shape, dtype="bool") else: diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 06e4b5279fff0..0a669a7892dc0 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -589,6 +589,11 @@ def test_construct_cast_invalid(self, dtype): with pytest.raises(TypeError, match=msg): pd.Series(arr).astype(dtype) + def test_coerce_to_ndarray_float_NA_rasies(self): + a = pd.array([0, 1, 2], dtype="Int64") + with pytest.raises(TypeError, match="NAType"): + a._coerce_to_ndarray(dtype="float", na_value=pd.NA) + def test_frame_repr(data_missing): From f27a5c2f9ae2de46cbd5992b7f57c1141e9ee01c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Dec 2019 12:01:44 -0600 Subject: [PATCH 28/38] fixup --- pandas/core/arrays/integer.py | 4 ++-- pandas/tests/extension/test_integer.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 9d9dbf6b34392..a4aa43dd4a49a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -384,9 +384,9 @@ def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default): if dtype is None: dtype = object - if is_float_dtype(dtype) and na_value is lib._no_default: + if na_value is lib._no_default and is_float_dtype(dtype): na_value = np.nan - else: + elif na_value is lib._no_default: na_value = libmissing.NA if is_integer_dtype(dtype): diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 85a7087eb0f48..74b926f9a57b0 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -145,7 +145,8 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): if op_name in {"__pow__", "__rpow__"}: # TODO: https://github.com/pandas-dev/pandas/issues/29997 # pow(1, NA) is NA or 1? - pytest.skip("TODO-29997") + # pytest.skip("TODO-29997") + pass if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA From 5d62af8f4f87e17c91e89cd423029ccbafe4fa43 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Dec 2019 14:46:23 -0600 Subject: [PATCH 29/38] updates --- pandas/tests/arrays/test_integer.py | 9 +++++---- pandas/tests/extension/test_integer.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 2a9e7720a3ba2..57a98bc313700 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -490,20 +490,21 @@ def test_compare_to_string(self, any_nullable_int_dtype): # GH 28930 s = pd.Series([1, None], dtype=any_nullable_int_dtype) result = s == "a" - expected = pd.Series([False, False]) + expected = pd.Series([False, pd.NA], dtype="boolean") self.assert_series_equal(result, expected) def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): # GH 28930 - s1 = pd.Series([1, 2, 3], dtype=any_nullable_int_dtype) - s2 = pd.Series([1, 2, 3], dtype="int") + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") method = getattr(s1, all_compare_operators) result = method(2) method = getattr(s2, all_compare_operators) - expected = method(2) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 55ea37e0f1f9e..9556a882dd136 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -168,6 +168,16 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) From 2bf57d690932773d3527a60cd7c2566d2d4270ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 16 Dec 2019 15:02:42 -0600 Subject: [PATCH 30/38] test, repr --- pandas/core/arrays/integer.py | 8 -------- pandas/tests/arrays/test_integer.py | 9 +++------ 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 423e8b0da1ea5..712746a7ccef0 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -363,14 +363,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return "NaN" - return str(x) - - return fmt - def __getitem__(self, item): if is_integer(item): if self._mask[item]: diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 57a98bc313700..0279b8b9ad145 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "\n[1, NaN, 3]\nLength: 3, dtype: Int64" + expected = "\n[1, NA, 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "\n" - "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + "[ 1, 2, NA, 1, 2, NA, 1, 2, NA, 1,\n" " ...\n" - " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + " NA, 1, 2, NA, 1, 2, NA, 1, 2, NA]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -143,9 +143,6 @@ def _check_op(self, s, op_name, other, exc=None): # The 0 was being turned into a NaN, most likely via the mask. result = op(s, other) - if op_name == "__rtruediv__": - pytest.skip(msg="TODO: what's expected?") - # compute expected mask = s.isna() From 021dc7b1aa7473429645088ce9d27bd103e47af7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 17 Dec 2019 11:17:22 -0600 Subject: [PATCH 31/38] fixup --- pandas/core/arrays/integer.py | 4 ++-- pandas/core/ops/__init__.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 712746a7ccef0..290aaf3aafebb 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -645,12 +645,13 @@ def _values_for_argsort(self) -> np.ndarray: @classmethod def _create_comparison_method(cls, op): + op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): from pandas.arrays import BooleanArray mask = None - op_name = op.__name__ if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask @@ -745,7 +746,6 @@ def _create_arithmetic_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def integer_arithmetic_method(self, other): - # nans propagate omask = None diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 561b114b3d202..ffa38cbc3d658 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -10,6 +10,7 @@ import numpy as np from pandas._libs import Timedelta, Timestamp, lib +from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype from pandas.core.dtypes.generic import ( @@ -20,8 +21,6 @@ ) from pandas.core.dtypes.missing import isna -# ----------------------------------------------------------------------------- -# Ops Wrapping Utilities from pandas.core.construction import extract_array from pandas.core.ops.array_ops import ( arithmetic_op, @@ -59,7 +58,9 @@ rtruediv, rxor, ) -from pandas.util import Appender + +# ----------------------------------------------------------------------------- +# Ops Wrapping Utilities def get_op_result_name(left, right): From 197f18b8b46aa2f3592f47d96618ea83398e8977 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 17 Dec 2019 11:22:15 -0600 Subject: [PATCH 32/38] enable --- pandas/tests/arrays/test_integer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 0279b8b9ad145..772ec289d7ca3 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -139,8 +139,6 @@ def _check_divmod_op(self, s, op, other, exc=None): def _check_op(self, s, op_name, other, exc=None): op = self.get_op_from_name(op_name) - # XXX: On master, this was mutating `s` inplace for rtrudiv. - # The 0 was being turned into a NaN, most likely via the mask. result = op(s, other) # compute expected @@ -160,8 +158,15 @@ def _check_op(self, s, op_name, other, exc=None): if omask is not None: mask |= omask - if op_name in {"__pow__", "__rpow__"}: - pytest.skip("tested elsewhere") + # 1 ** na is na, so need to unmask those + if op_name == "__pow__": + mask = np.where(~s.isna() & (s == 1), False, mask) + + elif op_name == "__rpow__": + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) # float result type or float op if ( From 259b779ba51b30d4d563ae91dffbb13726b9bd9c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 17 Dec 2019 11:24:35 -0600 Subject: [PATCH 33/38] fixup --- pandas/tests/extension/test_integer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 9556a882dd136..8e54543e5437c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -142,11 +142,6 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass - if op_name in {"__pow__", "__rpow__"}: - # TODO: https://github.com/pandas-dev/pandas/issues/29997 - # pow(1, NA) is NA or 1? - # pytest.skip("TODO-29997") - pass if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA From 3183d539023c302e03efad5cb330b819c2be55d9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 09:47:07 -0600 Subject: [PATCH 34/38] ints --- pandas/tests/arrays/test_integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 772ec289d7ca3..c8c09b187f69b 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -436,7 +436,7 @@ def _compare_other(self, data, op_name, other): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("other", [True, False, pd.NA]) + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) def test_scalar(self, other, all_compare_operators): op = self.get_op_from_name(all_compare_operators) a = pd.array([1, 0, None], dtype="Int64") From 4986d8497ba45a552b504a110fa52aa783880bd7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Dec 2019 11:19:44 -0600 Subject: [PATCH 35/38] restore comment --- pandas/core/arrays/integer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 290aaf3aafebb..68ea23825b6b0 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -674,6 +674,11 @@ def cmp_method(self, other): mask = np.ones(self._data.shape, dtype="bool") else: with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. warnings.filterwarnings("ignore", "elementwise", FutureWarning) with np.errstate(all="ignore"): method = getattr(self._data, f"__{op_name}__") From b39dc604f0a2324278784e8cfc15b03353520e7f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 08:36:12 -0600 Subject: [PATCH 36/38] docs --- doc/source/whatsnew/v1.0.0.rst | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a6ba7770dadcc..6c750be7eea79 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -365,6 +365,64 @@ The following methods now also correctly output values for unobserved categories As a reminder, you can specify the ``dtype`` to disable all inference. +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` rather than +:attr:`numpy.nan` as its missing value marker (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a[2] + nan + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a[2] + +See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` +and :attr:`numpy.nan`. + +:class:`arrays.IntegerArray` comparisons return :class:`arrays.BooleanArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Comparison operations on a :class:`arrays.IntegerArray` now returns a +:class:`arrays.BooleanArray` rather than a NumPy array (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a > 1 + array([False, True, False]) + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64"`` + a > 1 + +Note that missing values now propagate, rather than always comparing unequal +like :attr:`numpy.nan`. See :ref:`missing_data.NA` for more. + By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 800158d45fda2b01b26fc414849d5759060b9de3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 08:38:35 -0600 Subject: [PATCH 37/38] docs --- doc/source/user_guide/integer_na.rst | 5 +++++ pandas/core/arrays/integer.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index f08cbcf0dc698..a45d7a4fa1547 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,6 +15,11 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. + In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. Because ``NaN`` is a float, this forces an array of integers with any missing values to become floating point. In some cases, this may not matter diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ad2a4c9ce36a7..62f31addedc0b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -265,7 +265,7 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): .. versionchanged:: 1.0.0 - Now uses :attr:`pandas.NA` as its missing value, rather + Now uses :attr:`pandas.NA` as the missing value rather than :attr:`numpy.nan`. .. warning:: From e5d6832f7f7f21a3a1f19ab0f56eaa48c25845de Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 10:13:45 -0600 Subject: [PATCH 38/38] fixup --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6c750be7eea79..8755abe642068 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -417,7 +417,7 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a .. ipython:: python - a = pd.array([1, 2, None], dtype="Int64"`` + a = pd.array([1, 2, None], dtype="Int64") a > 1 Note that missing values now propagate, rather than always comparing unequal