Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP [ArrayManager] API: setitem to set new columns / loc+iloc to update inplace #39578

Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ jobs:
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/frame/indexing --array-manager
pytest pandas/tests/indexing --array-manager
5 changes: 4 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,11 +382,14 @@ def __len__(self):
# Indices
# ----------------------------------------------------------------
@pytest.fixture
def multiindex_year_month_day_dataframe_random_data():
def multiindex_year_month_day_dataframe_random_data(using_array_manager):
"""
DataFrame with 3 level MultiIndex (year, month, day) covering
first 100 business days from 2000-01-01 with random data
"""
if using_array_manager:
# TODO(ArrayManager) groupby
pytest.skip("Not yet implemented for ArrayManager")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can the decorator be used for this?

tdf = tm.makeTimeDataFrame(100)
ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
# use Int64Index, to make sure things work
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,8 @@ def length_of_indexer(indexer, target=None) -> int:
start, stop = stop + 1, start + 1
step = -step
return (stop - start + step - 1) // step
elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)):
if isinstance(indexer, list):
elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list, range)):
if isinstance(indexer, (list, range)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the range case can be implemented separately + more efficiently, similar to the slice case

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean that a range object should have been converted to a slice before calling length_of_indexer?

At the moment it's simply the _setitem_with_indexer_split_path case that calls that method.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, but it's probably because of calling self.iloc._setitem_with_indexer(..) in frame.py instead of self.iloc[..] = .. that the conversion doesn't happen anymore (iloc.__setitem__ calls indexer = self._get_setitem_indexer(key) before passing the indexer to _setitem_with_indexer and the _get_setitem_indexer converts a range object in a list.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was just thinking (indexer.start - indexer.stop) // indexer.step

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this not work?

indexer = np.array(indexer)

if indexer.dtype == bool:
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,14 @@ def fast_xs(self, loc: int) -> ArrayLike:
else:
temp_dtype = dtype

result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype)
if dtype == "object":
# TODO properly test this, check
# pandas/tests/indexing/test_chaining_and_caching.py::TestChaining
# ::test_chained_getitem_with_lists
result = np.empty(self.shape_proper[1], dtype=dtype)
result[:] = [arr[loc] for arr in self.arrays]
else:
result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype)
if isinstance(dtype, ExtensionDtype):
result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)
return result
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/indexing/multiindex/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import DataFrame, MultiIndex, Series
import pandas._testing as tm
import pandas.core.common as com
Expand Down Expand Up @@ -28,6 +30,7 @@ def test_detect_chained_assignment():
zed["eyes"]["right"].fillna(value=555, inplace=True)


@td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view
def test_cache_updating():
# 5216
# make sure that we don't try to set a dead cache
Expand Down
17 changes: 16 additions & 1 deletion pandas/tests/indexing/multiindex/test_setitem.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna
import pandas._testing as tm
Expand Down Expand Up @@ -116,6 +118,9 @@ def check(target, indexers, value, compare_fn, expected=None):
expected=copy,
)

# TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in
# all NaNs -> doesn't work in the "split" path (also for BlockManager actually)
@td.skip_array_manager_not_yet_implemented
def test_multiindex_setitem(self):

# GH 3738
Expand Down Expand Up @@ -170,7 +175,7 @@ def test_multiindex_setitem(self):
df.loc[idx[:, :, "Stock"], "price"] *= 2
tm.assert_frame_equal(df, expected)

def test_multiindex_assignment(self):
def test_multiindex_assignment(self, using_array_manager):

# GH3777 part 2

Expand All @@ -195,11 +200,15 @@ def test_multiindex_assignment(self):

df.loc[4, "c"] = arr
exp = Series(arr, index=[8, 10], name="c", dtype="float64")
if using_array_manager:
exp = exp.astype("int64")
tm.assert_series_equal(df.loc[4, "c"], exp)

# scalar ok
df.loc[4, "c"] = 10
exp = Series(10, index=[8, 10], name="c", dtype="float64")
if using_array_manager:
exp = exp.astype("int64")
tm.assert_series_equal(df.loc[4, "c"], exp)

# invalid assignments
Expand Down Expand Up @@ -312,6 +321,8 @@ def test_frame_getitem_setitem_multislice(self):
df.loc[:, :] = 10
tm.assert_frame_equal(df, result)

# TODO(ArrayManager) iset with multiple elements not yet implemented
@td.skip_array_manager_not_yet_implemented
def test_frame_setitem_multi_column(self):
df = DataFrame(
np.random.randn(10, 4), columns=[["a", "a", "b", "b"], [0, 1, 0, 1]]
Expand Down Expand Up @@ -417,6 +428,8 @@ def test_nonunique_assignment_1750(self):

assert (df.xs((1, 1))["C"] == "_").all()

# TODO(ArrayManager) iset with multiple elements not yet implemented
@td.skip_array_manager_not_yet_implemented
def test_astype_assignment_with_dups(self):

# GH 4686
Expand All @@ -439,6 +452,8 @@ def test_setitem_nonmonotonic(self):
tm.assert_frame_equal(df, expected)


@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values
# is not a view
def test_frame_setitem_view_direct(multiindex_dataframe_random_data):
# this works because we are modifying the underlying array
# really a no-no
Expand Down
17 changes: 11 additions & 6 deletions pandas/tests/indexing/test_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,19 @@
import pandas._testing as tm


def test_at_timezone():
def test_at_timezone(using_array_manager):
# https://github.com/pandas-dev/pandas/issues/33544
result = DataFrame({"foo": [datetime(2000, 1, 1)]})
result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc)
expected = DataFrame(
{"foo": [datetime(2000, 1, 2, tzinfo=timezone.utc)]}, dtype=object
)
tm.assert_frame_equal(result, expected)
if using_array_manager:
# TODO(ArrayManager) this should give a better error message
with pytest.raises(TypeError, match="tz-naive and tz-aware"):
result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc)
else:
result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc)
expected = DataFrame(
{"foo": [datetime(2000, 1, 2, tzinfo=timezone.utc)]}, dtype=object
)
tm.assert_frame_equal(result, expected)


class TestAtSetItem:
Expand Down
39 changes: 26 additions & 13 deletions pandas/tests/indexing/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def test_setitem_chained_setfault(self):
tm.assert_frame_equal(result, expected)

@pytest.mark.arm_slow
def test_detect_chained_assignment(self):
def test_detect_chained_assignment(self, using_array_manager):

pd.set_option("chained_assignment", "raise")

Expand All @@ -150,19 +150,25 @@ def test_detect_chained_assignment(self):
df = DataFrame(
{
"A": Series(range(2), dtype="int64"),
"B": np.array(np.arange(2, 4), dtype=np.float64),
"B": np.array(np.arange(1, 4, 2), dtype=np.float64),
}
)
assert df._is_copy is None

msg = "A value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
df["A"][0] = -5
if not using_array_manager:
with pytest.raises(com.SettingWithCopyError, match=msg):
df["A"][0] = -5

with pytest.raises(com.SettingWithCopyError, match=msg):
df["A"][1] = np.nan
with pytest.raises(com.SettingWithCopyError, match=msg):
df["A"][1] = np.nan

assert df["A"]._is_copy is None
assert df["A"]._is_copy is None
else:
df["A"][0] = -5
df["A"][1] = -6
expected["B"] = expected["B"].astype("float64")
tm.assert_frame_equal(df, expected)

# Using a copy (the chain), fails
df = DataFrame(
Expand Down Expand Up @@ -191,13 +197,17 @@ def test_detect_chained_assignment(self):
expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]})
df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})

with pytest.raises(com.SettingWithCopyError, match=msg):
if not using_array_manager:
with pytest.raises(com.SettingWithCopyError, match=msg):
df["A"][0] = 111

df.loc[0, "A"] = 111
else:
df["A"][0] = 111

with pytest.raises(com.SettingWithCopyError, match=msg):
df.loc[0]["A"] = 111

df.loc[0, "A"] = 111
tm.assert_frame_equal(df, expected)

# gh-5475: Make sure that is_copy is picked up reconstruction
Expand Down Expand Up @@ -256,10 +266,12 @@ def random_text(nobs=100):
df.loc[:, "letters"] = df["letters"].apply(str.lower)

# Should be ok even though it's a copy!
assert df._is_copy is None
if not using_array_manager:
assert df._is_copy is None

df["letters"] = df["letters"].apply(str.lower)
assert df._is_copy is None
if not using_array_manager:
assert df._is_copy is None

df = random_text(100000)
indexer = df.letters.apply(lambda x: len(x) > 10)
Expand Down Expand Up @@ -313,8 +325,9 @@ def random_text(nobs=100):
with pytest.raises(com.SettingWithCopyError, match=msg):
df.loc[2]["C"] = "foo"

with pytest.raises(com.SettingWithCopyError, match=msg):
df["C"][2] = "foo"
if not using_array_manager:
with pytest.raises(com.SettingWithCopyError, match=msg):
df["C"][2] = "foo"

def test_setting_with_copy_bug(self):

Expand Down
14 changes: 9 additions & 5 deletions pandas/tests/indexing/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


class TestDatetimeIndex:
def test_indexing_with_datetime_tz(self):
def test_indexing_with_datetime_tz(self, using_array_manager):

# GH#8260
# support datetime64 with tz
Expand Down Expand Up @@ -65,10 +65,14 @@ def test_indexing_with_datetime_tz(self):
# trying to set a single element on a part of a different timezone
# this converts to object
df2 = df.copy()
df2.loc[df2.new_col == "new", "time"] = v

expected = Series([v[0], df.loc[1, "time"]], name="time")
tm.assert_series_equal(df2.time, expected)
if using_array_manager:
with pytest.raises(ValueError, match="Timezones don't match"):
df2.loc[df2.new_col == "new", "time"] = v
else:
df2.loc[df2.new_col == "new", "time"] = v

expected = Series([v[0], df.loc[1, "time"]], name="time")
tm.assert_series_equal(df2.time, expected)

v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s")
df.loc[df.new_col == "new", "time"] = v
Expand Down
Loading