From c331ba8ce586839e1405c70d18f3142506d03193 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Mar 2021 22:48:08 +0100 Subject: [PATCH] [ArrayManager] TST: run (+fix/skip) pandas/tests/frame/indexing tests (#40323) --- .github/workflows/ci.yml | 9 +-- pandas/core/internals/array_manager.py | 6 +- pandas/core/internals/base.py | 7 +++ pandas/tests/frame/indexing/test_indexing.py | 63 ++++++++++++++------ pandas/tests/frame/indexing/test_insert.py | 10 +++- pandas/tests/frame/indexing/test_setitem.py | 26 +++++--- pandas/tests/frame/indexing/test_xs.py | 36 +++++++++-- 7 files changed, 111 insertions(+), 46 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a5b389968aaa..a395fa8ce3b9e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -155,10 +155,7 @@ jobs: run: | source activate pandas-dev - pytest pandas/tests/frame/methods - pytest pandas/tests/frame/test_constructors.py - pytest pandas/tests/frame/test_* - pytest pandas/tests/frame/test_reductions.py + pytest pandas/tests/frame/ pytest pandas/tests/reductions/ pytest pandas/tests/generic/test_generic.py pytest pandas/tests/arithmetic/ @@ -170,10 +167,6 @@ jobs: pytest pandas/tests/series/test_* # indexing subset (temporary since other tests don't pass yet) - pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean - pytest pandas/tests/frame/indexing/test_where.py - pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_multi_index - pytest pandas/tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_listlike_indexer_duplicate_columns pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index f8df140e6bf9e..744d8e187e9a7 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -861,7 +861,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): # DataFrame into 1D array when loc is an integer if isinstance(value, np.ndarray) and value.ndim == 2: assert value.shape[1] == 1 - value = value[0, :] + value = value[:, 0] # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item # but we should avoid that and pass directly the proper array @@ -1163,10 +1163,6 @@ def axes(self): def index(self) -> Index: return self._axes[0] - @property - def array(self): - return self.arrays[0] - @property def dtype(self): return self.array.dtype diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index c5bb2283d23e4..7afb6e5d7e544 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -126,6 +126,13 @@ def isna(self: T, func) -> T: class SingleDataManager(DataManager): ndim = 1 + @property + def array(self): + """ + Quick access to the backing array of the Block or SingleArrayManager. + """ + return self.arrays[0] # type: ignore[attr-defined] + def interleaved_dtype(dtypes: List[DtypeObj]) -> Optional[DtypeObj]: """ diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9370f97f8310b..928b42b915b18 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -8,6 +8,7 @@ import pytest from pandas._libs import iNaT +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer @@ -534,6 +535,7 @@ def test_getitem_setitem_integer_slice_keyerrors(self): with pytest.raises(KeyError, match=r"^3$"): df2.loc[3:11] = 0 + @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 @@ -592,6 +594,7 @@ def test_getitem_fancy_scalar(self, float_frame): for idx in f.index[::5]: assert ix[idx, col] == ts[idx] + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_setitem_fancy_scalar(self, float_frame): f = float_frame expected = float_frame.copy() @@ -631,6 +634,7 @@ def test_getitem_fancy_boolean(self, float_frame): expected = f.reindex(index=f.index[boolvec], columns=["C", "D"]) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_setitem_fancy_boolean(self, float_frame): # from 2d, set with booleans frame = float_frame.copy() @@ -990,21 +994,29 @@ def test_iloc_row(self): expected = df.loc[8:14] tm.assert_frame_equal(result, expected) + # list of integers + result = df.iloc[[1, 2, 4, 6]] + expected = df.reindex(df.index[[1, 2, 4, 6]]) + tm.assert_frame_equal(result, expected) + + def test_iloc_row_slice_view(self, using_array_manager): + df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2)) + original = df.copy() + # verify slice is view # setting it makes it raise/warn + subset = df.iloc[slice(4, 8)] + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - result[2] = 0.0 + subset[2] = 0.0 - exp_col = df[2].copy() - exp_col[4:8] = 0.0 + exp_col = original[2].copy() + # TODO(ArrayManager) verify it is expected that the original didn't change + if not using_array_manager: + exp_col[4:8] = 0.0 tm.assert_series_equal(df[2], exp_col) - # list of integers - result = df.iloc[[1, 2, 4, 6]] - expected = df.reindex(df.index[[1, 2, 4, 6]]) - tm.assert_frame_equal(result, expected) - def test_iloc_col(self): df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2)) @@ -1022,19 +1034,32 @@ def test_iloc_col(self): expected = df.loc[:, 8:14] tm.assert_frame_equal(result, expected) - # verify slice is view - # and that we are setting a copy - msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - result[8] = 0.0 - - assert (df[8] == 0).all() - # list of integers result = df.iloc[:, [1, 2, 4, 6]] expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) + def test_iloc_col_slice_view(self, using_array_manager): + df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2)) + original = df.copy() + subset = df.iloc[:, slice(4, 8)] + + if not using_array_manager: + # verify slice is view + # and that we are setting a copy + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + subset[8] = 0.0 + + assert (df[8] == 0).all() + else: + # TODO(ArrayManager) verify this is the desired behaviour + subset[8] = 0.0 + # subset changed + assert (subset[8] == 0).all() + # but df itself did not change (setitem replaces full column) + tm.assert_frame_equal(df, original) + def test_loc_duplicates(self): # gh-17105 @@ -1218,7 +1243,7 @@ def test_setitem(self, uint64_frame): ) -def test_object_casting_indexing_wraps_datetimelike(): +def test_object_casting_indexing_wraps_datetimelike(using_array_manager): # GH#31649, check the indexing methods all the way down the stack df = DataFrame( { @@ -1240,6 +1265,10 @@ def test_object_casting_indexing_wraps_datetimelike(): assert isinstance(ser.values[1], Timestamp) assert isinstance(ser.values[2], pd.Timedelta) + if using_array_manager: + # remainder of the test checking BlockManager internals + return + mgr = df._mgr mgr._rebuild_blknos_and_blklocs() arr = mgr.fast_xs(0) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 921d277e09230..4f5ec8eff29a6 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -72,11 +72,17 @@ def test_insert_with_columns_dups(self): ) tm.assert_frame_equal(df, exp) - def test_insert_item_cache(self): + def test_insert_item_cache(self, using_array_manager): df = DataFrame(np.random.randn(4, 3)) ser = df[0] - with tm.assert_produces_warning(PerformanceWarning): + if using_array_manager: + expected_warning = None + else: + # with BlockManager warn about high fragmentation of single dtype + expected_warning = PerformanceWarning + + with tm.assert_produces_warning(expected_warning): for n in range(100): df[n + 3] = df[1] * n diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 10f103343a8b4..661df8a792c65 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.base import registry as ea_registry from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -298,12 +300,12 @@ def test_setitem_dt64tz(self, timezone_frame): # assert that A & C are not sharing the same base (e.g. they # are copies) - b1 = df._mgr.blocks[1] - b2 = df._mgr.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - b1base = b1.values._data.base - b2base = b2.values._data.base - assert b1base is None or (id(b1base) != id(b2base)) + v1 = df._mgr.arrays[1] + v2 = df._mgr.arrays[2] + tm.assert_extension_array_equal(v1, v2) + v1base = v1._data.base + v2base = v2._data.base + assert v1base is None or (id(v1base) != id(v2base)) # with nan df2 = df.copy() @@ -366,7 +368,7 @@ def test_setitem_frame_length_0_str_key(self, indexer): expected["A"] = expected["A"].astype("object") tm.assert_frame_equal(df, expected) - def test_setitem_frame_duplicate_columns(self): + def test_setitem_frame_duplicate_columns(self, using_array_manager): # GH#15695 cols = ["A", "B", "C"] * 2 df = DataFrame(index=range(3), columns=cols) @@ -382,6 +384,11 @@ def test_setitem_frame_duplicate_columns(self): columns=cols, dtype="object", ) + if using_array_manager: + # setitem replaces column so changes dtype + expected["C"] = expected["C"].astype("int64") + # TODO(ArrayManager) .loc still overwrites + expected["B"] = expected["B"].astype("int64") tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]]) @@ -628,6 +635,8 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): class TestDataFrameSetItemWithExpansion: + # TODO(ArrayManager) update parent (_maybe_update_cacher) + @td.skip_array_manager_not_yet_implemented def test_setitem_listlike_views(self): # GH#38148 df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) @@ -699,7 +708,7 @@ def test_setitem_with_expansion_categorical_dtype(self): result1 = df["D"] result2 = df["E"] - tm.assert_categorical_equal(result1._mgr._block.values, cat) + tm.assert_categorical_equal(result1._mgr.array, cat) # sorting ser.name = "E" @@ -767,6 +776,7 @@ def inc(x): class TestDataFrameSetItemBooleanMask: + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values @pytest.mark.parametrize( "mask_type", [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 59c88af265c08..57ea6a63f86e8 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -109,14 +109,22 @@ def test_xs_keep_level(self): result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view(self): + def test_xs_view(self, using_array_manager): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) - dm.xs(2)[:] = 10 - assert (dm.xs(2) == 10).all() + if using_array_manager: + # INFO(ArrayManager) with ArrayManager getting a row as a view is + # not possible + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + dm.xs(2)[:] = 20 + assert not (dm.xs(2) == 20).any() + else: + dm.xs(2)[:] = 20 + assert (dm.xs(2) == 20).all() class TestXSWithMultiIndex: @@ -327,10 +335,26 @@ def test_xs_droplevel_false(self): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view(self): + def test_xs_droplevel_false_view(self, using_array_manager): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) - df.values[0, 0] = 2 - expected = DataFrame({"a": [2]}) + # check that result still views the same data as df + assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) + # modifying original df also modifies result when having a single block + df.iloc[0, 0] = 2 + if not using_array_manager: + expected = DataFrame({"a": [2]}) + else: + # TODO(ArrayManager) iloc does not update the array inplace using + # "split" path + expected = DataFrame({"a": [1]}) + tm.assert_frame_equal(result, expected) + + # with mixed dataframe, modifying the parent doesn't modify result + # TODO the "split" path behaves differently here as with single block + df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) + result = df.xs("a", axis=1, drop_level=False) + df.iloc[0, 0] = 2 + expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected)