From 6fffb0262fe2aa0dd5fa4fc3547857c8bc53bd9f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 8 Jan 2021 14:28:03 -0800 Subject: [PATCH 1/7] BUG/API: make setitem-inplace preserve dtype when possible with PandasArray, IntegerArray, FloatingArray --- pandas/core/dtypes/missing.py | 19 ++++++++++- pandas/core/indexing.py | 2 +- pandas/core/internals/blocks.py | 45 ++++++++++++++++++++----- pandas/tests/extension/test_floating.py | 23 +++++++++---- pandas/tests/extension/test_integer.py | 29 ++++++++++++---- pandas/tests/extension/test_numpy.py | 33 ++++++++++++++++++ 6 files changed, 129 insertions(+), 22 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0455c01fa085..a30875a2783d2 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -506,7 +506,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val): +def infer_fill_value(val, length: int): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -514,6 +514,23 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] + + if type(val).__name__ == "PandasArray": + # for test_numpy test where we patch PandasArray._typ + val = val.to_numpy() + + if is_extension_array_dtype(val): + # We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype + if len(val) == length: + # TODO: in this case see if we can avoid making a copy later on + return val + if length == 0: + return val[:0].copy() + + dtype = val.dtype + cls = dtype.construct_array_type() + return cls._from_sequence([dtype._na_value], dtype=dtype).repeat(length) + val = np.array(val, copy=False) if needs_i8_conversion(val.dtype): return np.array("NaT", dtype=val.dtype) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 12694c19b2173..0338f8f8810f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1601,7 +1601,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # We are setting an entire column self.obj[key] = value else: - self.obj[key] = infer_fill_value(value) + self.obj[key] = infer_fill_value(value, len(self.obj)) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 94c7d325d0bc8..27d01e896ad64 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -65,8 +65,11 @@ Categorical, DatetimeArray, ExtensionArray, + FloatingArray, + IntegerArray, PandasArray, PandasDtype, + StringArray, TimedeltaArray, ) from pandas.core.base import PandasObject @@ -621,10 +624,17 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) raise TypeError(msg) + values = self.values dtype = pandas_dtype(dtype) + if isinstance(dtype, ExtensionDtype) and self.values.ndim == 2: + # TODO(EA2D): kludge not needed with 2D EAs (astype_nansafe would raise) + # note DataFrame.astype has special handling to avoid getting here + if self.shape[0] != 1: + raise NotImplementedError("Need 2D EAs!") + values = values[0] try: - new_values = self._astype(dtype, copy=copy) + new_values = self._astype(values, dtype, copy=copy) except (ValueError, TypeError): # e.g. astype_nansafe can fail on object-dtype of strings # trying to convert to float @@ -643,8 +653,8 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) return newb - def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: - values = self.values + @staticmethod + def _astype(values, dtype: DtypeObj, copy: bool) -> ArrayLike: if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) @@ -936,6 +946,16 @@ def setitem(self, indexer, value): return self.astype(dtype).setitem(indexer, value) + value = extract_array(value, extract_numpy=True) + if isinstance(value, PandasArray) and not isinstance(value, StringArray): + # this is redundant with extract_array except for PandasArray tests + # that patch _typ + value = value.to_numpy() + + if isinstance(value, (IntegerArray, FloatingArray)) and not value._mask.any(): + # GH#38896 + value = value.to_numpy(value.dtype.numpy_dtype) + # value must be storable at this moment if is_extension_array_dtype(getattr(value, "dtype", None)): # We need to be careful not to allow through strings that @@ -1910,6 +1930,10 @@ class FloatBlock(NumericBlock): def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: + if isinstance(element, (IntegerArray, FloatingArray)): + # GH#38896 + if element._mask.any(): + return False return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( tipo.type, np.timedelta64 ) @@ -1975,11 +1999,16 @@ class IntBlock(NumericBlock): def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: - return ( - issubclass(tipo.type, np.integer) - and not issubclass(tipo.type, np.timedelta64) - and self.dtype.itemsize >= tipo.itemsize - ) + if issubclass(tipo.type, np.integer): + if isinstance(element, IntegerArray): + # GH#38896 + if element._mask.any(): + return False + + return ( + not issubclass(tipo.type, np.timedelta64) + and self.dtype.itemsize >= tipo.itemsize + ) # We have not inferred an integer from the dtype # check if we have a builtin int or a float equal to an int return is_integer(element) or (is_float(element) and element.is_integer()) diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index c08c31e90fecc..2688998096a61 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -24,12 +24,22 @@ from pandas.tests.extension import base -def make_data(): +def make_data(with_nas: bool = True): + if with_nas: + return ( + list(np.arange(0.1, 0.9, 0.1)) + + [pd.NA] + + list(np.arange(1, 9.8, 0.1)) + + [pd.NA] + + [9.9, 10.0] + ) + # case without pd.NA that can be cast to floating ndarray losslessly + # TODO: get cases with np.nan instead of pd.NA GH#39039 return ( list(np.arange(0.1, 0.9, 0.1)) - + [pd.NA] + + [np.nan] + list(np.arange(1, 9.8, 0.1)) - + [pd.NA] + + [np.nan] + [9.9, 10.0] ) @@ -39,9 +49,10 @@ def dtype(request): return request.param() -@pytest.fixture -def data(dtype): - return pd.array(make_data(), dtype=dtype) +@pytest.fixture(params=[True, False]) +def data(dtype, request): + with_nas = request.param + return pd.array(make_data(with_nas), dtype=dtype) @pytest.fixture diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 99a32203053c6..1c944a9fccea8 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -33,8 +33,11 @@ from pandas.tests.extension import base -def make_data(): - return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] +def make_data(with_nas: bool = True): + if with_nas: + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] + + return list(range(1, 101)) @pytest.fixture( @@ -53,9 +56,10 @@ def dtype(request): return request.param() -@pytest.fixture -def data(dtype): - return pd.array(make_data(), dtype=dtype) +@pytest.fixture(params=[True, False]) +def data(dtype, request): + with_nas = request.param + return pd.array(make_data(with_nas), dtype=dtype) @pytest.fixture @@ -198,7 +202,20 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - pass + def test_setitem_series(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + ser = expected = pd.Series(data, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") + + key = full_indexer(ser) + result.loc[key] = ser + + if not data._mask.any(): + # GH#38896 like we do with ndarray, we set the values inplace + # but cast to the new numpy dtype + expected = pd.Series(data.to_numpy(data.dtype.numpy_dtype), name="data") + + self.assert_series_equal(result, expected) class TestMissing(base.BaseMissingTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 1f0181eec8830..6b3b0501f9f8f 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -501,6 +501,39 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_loc_iloc_slice(self, data): super().test_setitem_loc_iloc_slice(data) + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): + # https://github.com/pandas-dev/pandas/issues/32395 + df = pd.DataFrame({"data": pd.Series(data)}) + result = pd.DataFrame(index=df.index) + + key = full_indexer(df) + result.loc[key, "data"] = df["data"]._values + + # For PandasArray we expect to get unboxed to numpy + expected = pd.DataFrame({"data": data.to_numpy()}) + if isinstance(key, slice) and ( + key == slice(None) or data.dtype.numpy_dtype == object + ): + mark = pytest.mark.xfail( + reason="This case goes through a different code path" + ) + # Other cases go through Block.setitem + request.node.add_marker(mark) + + self.assert_frame_equal(result, expected) + + def test_setitem_series(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + ser = pd.Series(data, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") + + key = full_indexer(ser) + result.loc[key] = ser + + # For PandasArray we expect to get unboxed to numpy + expected = pd.Series(data.to_numpy(), name="data") + self.assert_series_equal(result, expected) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): From de107082ddb410bff174555a1ec9ca82e8c71511 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Jan 2021 09:53:06 -0800 Subject: [PATCH 2/7] do patching inside test_numpy --- pandas/core/dtypes/missing.py | 4 ---- pandas/tests/extension/test_numpy.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index a30875a2783d2..6bf696f39d317 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -515,10 +515,6 @@ def infer_fill_value(val, length: int): if not is_list_like(val): val = [val] - if type(val).__name__ == "PandasArray": - # for test_numpy test where we patch PandasArray._typ - val = val.to_numpy() - if is_extension_array_dtype(val): # We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype if len(val) == length: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 6b3b0501f9f8f..e4608f069f591 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -16,6 +16,8 @@ import numpy as np import pytest +from pandas.core.dtypes.missing import infer_fill_value as infer_fill_value_orig + import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray, PandasDtype @@ -28,6 +30,16 @@ def dtype(request): return PandasDtype(np.dtype(request.param)) +def infer_fill_value(val, length: int): + # GH#39044 we have to patch core.dtypes.missing.infer_fill_value + # to unwrap PandasArray bc it won't recognize PandasArray with + # is_extension_dtype + if isinstance(val, PandasArray): + val = val.to_numpy() + + return infer_fill_value_orig(val, length) + + @pytest.fixture def allow_in_pandas(monkeypatch): """ @@ -47,6 +59,7 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") + m.setattr(pd.core.indexing, "infer_fill_value", infer_fill_value) yield From 284f36aaada92b50aba60fb3b67fe04e740a2b9e Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 10:28:30 -0800 Subject: [PATCH 3/7] move kludge to test_numpy --- pandas/core/internals/blocks.py | 8 -------- pandas/tests/extension/test_numpy.py | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 780d0c732c31b..2e7ee5f5e5e17 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -68,7 +68,6 @@ IntegerArray, PandasArray, PandasDtype, - StringArray, TimedeltaArray, ) from pandas.core.base import PandasObject @@ -935,13 +934,6 @@ def setitem(self, indexer, value): return self.astype(dtype).setitem(indexer, value) value = extract_array(value, extract_numpy=True) - if isinstance(value, PandasArray) and not isinstance(value, StringArray): - # this is redundant with extract_array except for PandasArray tests - # that patch _typ - value = value.to_numpy() - if self.ndim == 2 and value.ndim == 1: - # TODO(EA2D): special case not needed with 2D EAs - value = np.atleast_2d(value)#.T if isinstance(value, (IntegerArray, FloatingArray)) and not value._mask.any(): # GH#38896 diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 5f197abc3ccc5..13dc306fed6c5 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -40,6 +40,10 @@ def setitem(self, indexer, value): value = extract_array(value, extract_numpy=True) if isinstance(value, PandasArray) and not isinstance(value, StringArray): value = value.to_numpy() + if self.ndim == 2 and value.ndim == 1: + # TODO(EA2D): special case not needed with 2D EAs + value = np.atleast_2d(value) + return orig_setitem(self, indexer, value) @@ -73,7 +77,7 @@ def allow_in_pandas(monkeypatch): with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") m.setattr(pd.core.indexing, "infer_fill_value", infer_fill_value) - #m.setattr(pd.core.internals.Block, "setitem", setitem) + m.setattr(pd.core.internals.Block, "setitem", setitem) yield From b2aa366a7739107fdb269450e27a53b1bd1fc482 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 18:16:10 -0800 Subject: [PATCH 4/7] staticmethod -> function --- pandas/core/internals/blocks.py | 42 +++++++++++++------------ pandas/tests/extension/test_floating.py | 17 ++++------ 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 699b3bbaa05fe..f677a531b9432 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -633,7 +633,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): values = values[0] try: - new_values = self._astype(values, dtype, copy=copy) + new_values = astype_block_compat(values, dtype, copy=copy) except (ValueError, TypeError): # e.g. astype_nansafe can fail on object-dtype of strings # trying to convert to float @@ -652,25 +652,6 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) return newb - @staticmethod - def _astype(values, dtype: DtypeObj, copy: bool) -> ArrayLike: - - if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): - return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) - - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - if isinstance(values, ExtensionArray): - values = values.astype(dtype, copy=copy) - - else: - values = astype_nansafe(values, dtype, copy=copy) - - return values - def convert( self, copy: bool = True, @@ -2599,3 +2580,24 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype return mask + + +def astype_block_compat(values: ArrayLike, dtype: DtypeObj, copy: bool) -> ArrayLike: + """ + Series/DataFrame implementation of .astype + """ + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) + + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + if isinstance(values, ExtensionArray): + values = values.astype(dtype, copy=copy) + + else: + values = astype_nansafe(values, dtype, copy=copy) + + return values diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 2688998096a61..2736e02451bc8 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -25,21 +25,16 @@ def make_data(with_nas: bool = True): - if with_nas: - return ( - list(np.arange(0.1, 0.9, 0.1)) - + [pd.NA] - + list(np.arange(1, 9.8, 0.1)) - + [pd.NA] - + [9.9, 10.0] - ) - # case without pd.NA that can be cast to floating ndarray losslessly + na_value = pd.NA if with_nas else np.nan + # `not with_nas` -> case without pd.NA that can be cast to + # floating ndarray losslessly # TODO: get cases with np.nan instead of pd.NA GH#39039 + return ( list(np.arange(0.1, 0.9, 0.1)) - + [np.nan] + + [na_value] + list(np.arange(1, 9.8, 0.1)) - + [np.nan] + + [na_value] + [9.9, 10.0] ) From 7aeb2b5ea62859b935caa504738daed6c9d0b6ac Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 20 Jan 2021 15:54:05 -0800 Subject: [PATCH 5/7] typo fixup+test --- pandas/core/dtypes/missing.py | 2 +- pandas/tests/indexing/test_loc.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 6bf696f39d317..344b7fcb4e4a9 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -525,7 +525,7 @@ def infer_fill_value(val, length: int): dtype = val.dtype cls = dtype.construct_array_type() - return cls._from_sequence([dtype._na_value], dtype=dtype).repeat(length) + return cls._from_sequence([dtype.na_value], dtype=dtype).repeat(length) val = np.array(val, copy=False) if needs_i8_conversion(val.dtype): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 8b13bafdd012f..dd004bb17961d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -18,6 +18,7 @@ DatetimeIndex, Index, MultiIndex, + NaT, Series, SparseDtype, Timedelta, @@ -1344,6 +1345,19 @@ def test_loc_setitem_categorical_column_retains_dtype(self, ordered): expected = DataFrame({"A": [1], "B": Categorical(["b"], ordered=ordered)}) tm.assert_frame_equal(result, expected) + def test_loc_setitem_ea_not_full_column(self): + # GH#39163 + df = DataFrame({"A": range(5)}) + + val = date_range("2016-01-01", periods=3, tz="US/Pacific") + + df.loc[[0, 1, 2], "B"] = val + + bex = val.append(DatetimeIndex([NaT, NaT], dtype=val.dtype)) + expected = DataFrame({"A": range(5), "B": bex}) + assert expected.dtypes["B"] == val.dtype + tm.assert_frame_equal(df, expected) + class TestLocCallable: def test_frame_loc_getitem_callable(self): From 6ab44afd6ce90308c8d1dbc632acfa128d56d3dd Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 28 Jan 2021 07:46:33 -0800 Subject: [PATCH 6/7] docstring --- pandas/core/dtypes/missing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 344b7fcb4e4a9..6e0b5edce3eb7 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -508,9 +508,10 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: def infer_fill_value(val, length: int): """ - infer the fill value for the nan/NaT from the provided - scalar/ndarray/list-like if we are a NaT, return the correct dtyped - element to provide proper block construction + `val` is going to be inserted as (part of) a new column in a DataFrame + with the given length. If val cannot be made to fit exactly, + find an appropriately-dtyped NA value to construct a complete column from, + which we will later set `val` into. """ if not is_list_like(val): val = [val] From 450bf73e767b19f9ce1b81eaf2c9932eb66e3f42 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 29 Jan 2021 15:40:12 -0800 Subject: [PATCH 7/7] comment, revert floating --- pandas/tests/extension/test_floating.py | 18 ++++++------------ pandas/tests/extension/test_integer.py | 1 + 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index a6bde3a90934f..e6ab3a4de86f6 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -25,17 +25,12 @@ from pandas.tests.extension import base -def make_data(with_nas: bool = True): - na_value = pd.NA if with_nas else np.nan - # `not with_nas` -> case without pd.NA that can be cast to - # floating ndarray losslessly - # TODO: get cases with np.nan instead of pd.NA GH#39039 - +def make_data(): return ( list(np.arange(0.1, 0.9, 0.1)) - + [na_value] + + [pd.NA] + list(np.arange(1, 9.8, 0.1)) - + [na_value] + + [pd.NA] + [9.9, 10.0] ) @@ -45,10 +40,9 @@ def dtype(request): return request.param() -@pytest.fixture(params=[True, False]) -def data(dtype, request): - with_nas = request.param - return pd.array(make_data(with_nas), dtype=dtype) +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) @pytest.fixture diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 4b25e1850c3c5..fc0b34e05dce8 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -199,6 +199,7 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): def test_setitem_series(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 + # overriden because we have a different `expected` in some cases ser = expected = pd.Series(data, name="data") result = pd.Series(index=ser.index, dtype=object, name="data")