-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
API: better error-handling for df.set_index #22486
Changes from all commits
28ec3a9
8dd1453
18d3464
fa68660
6d3dfa1
159ecf5
46f8dc7
3a6469d
c3cdde2
dd8d000
47c4e74
06ed33a
9871be2
9da6e6c
361a905
42d5f2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,8 @@ def test_set_index_cast(self): | |
tm.assert_frame_equal(df, df2) | ||
|
||
# A has duplicate values, C does not | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
('tuple', 'as', 'label')]) | ||
@pytest.mark.parametrize('inplace', [True, False]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
def test_set_index_drop_inplace(self, frame_of_index_cols, | ||
|
@@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, | |
tm.assert_frame_equal(result, expected) | ||
|
||
# A has duplicate values, C does not | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
('tuple', 'as', 'label')]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
def test_set_index_append(self, frame_of_index_cols, drop, keys): | ||
df = frame_of_index_cols | ||
|
@@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): | |
tm.assert_frame_equal(result, expected) | ||
|
||
# A has duplicate values, C does not | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) | ||
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], | ||
('tuple', 'as', 'label')]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
def test_set_index_append_to_multiindex(self, frame_of_index_cols, | ||
drop, keys): | ||
|
@@ -114,8 +117,10 @@ def test_set_index_after_mutation(self): | |
tm.assert_frame_equal(result, expected) | ||
|
||
# MultiIndex constructor does not work directly on Series -> lambda | ||
# Add list-of-list constructor because list is ambiguous -> lambda | ||
# also test index name if append=True (name is duplicate here for B) | ||
@pytest.mark.parametrize('box', [Series, Index, np.array, | ||
list, tuple, iter, lambda x: [list(x)], | ||
lambda x: MultiIndex.from_arrays([x])]) | ||
@pytest.mark.parametrize('append, index_name', [(True, None), | ||
(True, 'B'), (True, 'test'), (False, None)]) | ||
|
@@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, | |
df.index.name = index_name | ||
|
||
key = box(df['B']) | ||
# np.array and list "forget" the name of B | ||
name = [None if box in [np.array, list] else 'B'] | ||
if box == list: | ||
# list of strings gets interpreted as list of keys | ||
msg = "['one', 'two', 'three', 'one', 'two']" | ||
with tm.assert_raises_regex(KeyError, msg): | ||
df.set_index(key, drop=drop, append=append) | ||
else: | ||
# np.array/tuple/iter/list-of-list "forget" the name of B | ||
name_mi = getattr(key, 'names', None) | ||
name = [getattr(key, 'name', None)] if name_mi is None else name_mi | ||
|
||
result = df.set_index(key, drop=drop, append=append) | ||
result = df.set_index(key, drop=drop, append=append) | ||
|
||
# only valid column keys are dropped | ||
# since B is always passed as array above, nothing is dropped | ||
expected = df.set_index(['B'], drop=False, append=append) | ||
expected.index.names = [index_name] + name if append else name | ||
# only valid column keys are dropped | ||
# since B is always passed as array above, nothing is dropped | ||
expected = df.set_index(['B'], drop=False, append=append) | ||
expected.index.names = [index_name] + name if append else name | ||
|
||
tm.assert_frame_equal(result, expected) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# MultiIndex constructor does not work directly on Series -> lambda | ||
# also test index name if append=True (name is duplicate here for A & B) | ||
@pytest.mark.parametrize('box', [Series, Index, np.array, list, | ||
@pytest.mark.parametrize('box', [Series, Index, np.array, | ||
list, tuple, iter, | ||
lambda x: MultiIndex.from_arrays([x])]) | ||
@pytest.mark.parametrize('append, index_name', | ||
[(True, None), (True, 'A'), (True, 'B'), | ||
|
@@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, | |
df.index.name = index_name | ||
|
||
keys = ['A', box(df['B'])] | ||
# np.array and list "forget" the name of B | ||
names = ['A', None if box in [np.array, list] else 'B'] | ||
# np.array/list/tuple/iter "forget" the name of B | ||
names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] | ||
|
||
result = df.set_index(keys, drop=drop, append=append) | ||
|
||
|
@@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, | |
# MultiIndex constructor does not work directly on Series -> lambda | ||
# We also emulate a "constructor" for the label -> lambda | ||
# also test index name if append=True (name is duplicate here for A) | ||
@pytest.mark.parametrize('box2', [Series, Index, np.array, list, | ||
@pytest.mark.parametrize('box2', [Series, Index, np.array, | ||
list, tuple, iter, | ||
lambda x: MultiIndex.from_arrays([x]), | ||
lambda x: x.name]) | ||
@pytest.mark.parametrize('box1', [Series, Index, np.array, list, | ||
@pytest.mark.parametrize('box1', [Series, Index, np.array, | ||
list, tuple, iter, | ||
lambda x: MultiIndex.from_arrays([x]), | ||
lambda x: x.name]) | ||
@pytest.mark.parametrize('append, index_name', [(True, None), | ||
|
@@ -183,21 +198,22 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, | |
df.index.name = index_name | ||
|
||
keys = [box1(df['A']), box2(df['A'])] | ||
result = df.set_index(keys, drop=drop, append=append) | ||
|
||
# == gives ambiguous Boolean for Series | ||
if drop and keys[0] is 'A' and keys[1] is 'A': | ||
with tm.assert_raises_regex(KeyError, '.*'): | ||
df.set_index(keys, drop=drop, append=append) | ||
else: | ||
result = df.set_index(keys, drop=drop, append=append) | ||
# if either box was iter, the content has been consumed; re-read it | ||
keys = [box1(df['A']), box2(df['A'])] | ||
|
||
# to test against already-tested behavior, we add sequentially, | ||
# hence second append always True; must wrap in list, otherwise | ||
# list-box will be illegal | ||
expected = df.set_index([keys[0]], drop=drop, append=append) | ||
expected = expected.set_index([keys[1]], drop=drop, append=True) | ||
# need to adapt first drop for case that both keys are 'A' -- | ||
# cannot drop the same column twice; | ||
# use "is" because == would give ambiguous Boolean error for containers | ||
first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop | ||
|
||
tm.assert_frame_equal(result, expected) | ||
# to test against already-tested behaviour, we add sequentially, | ||
# hence second append always True; must wrap keys in list, otherwise | ||
# box = list would be illegal | ||
expected = df.set_index([keys[0]], drop=first_drop, append=append) | ||
expected = expected.set_index([keys[1]], drop=drop, append=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('append', [True, False]) | ||
@pytest.mark.parametrize('drop', [True, False]) | ||
|
@@ -229,13 +245,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): | |
def test_set_index_raise(self, frame_of_index_cols, drop, append): | ||
df = frame_of_index_cols | ||
|
||
with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E | ||
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"): | ||
# column names are A-E, as well as one tuple | ||
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) | ||
|
||
# non-existent key in list with arrays | ||
with tm.assert_raises_regex(KeyError, '.*'): | ||
with tm.assert_raises_regex(KeyError, 'X'): | ||
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) | ||
|
||
msg = 'The parameter "keys" may only contain a combination of.*' | ||
# forbidden type, e.g. set | ||
with tm.assert_raises_regex(TypeError, msg): | ||
df.set_index(set(df['A']), drop=drop, append=append) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are you singling out an iterable (set) is there some reason? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, just any of the types that are not allowed (I've tried to indicate as much by using "e.g.") |
||
|
||
# forbidden type in list, e.g. set | ||
with tm.assert_raises_regex(TypeError, msg): | ||
df.set_index(['A', df['A'], set(df['A'])], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see my comment above, I am not sure a set is specifically excluded. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am proposing to exclude sets (see above). Irrespective of that, I need to test raising the TypeError for forbidden types (which type exactly is less relevant) |
||
drop=drop, append=append) | ||
|
||
def test_construction_with_categorical_index(self): | ||
ci = tm.makeCategoricalIndex(10) | ||
ci.name = 'B' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why exactly is a list interpreted differently here? this makes this test really really odd. I am worried something changed here, as this appears to work just fine in the master.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You will see above (line 123) that
list
wasn't tested - for precisely this reason.df.set_index(['A', 'B'])
interprets aA
&B
as column keys, so (assuming thisdf
was length 2) it would not use the list['A', 'B']
as the index. To do that, one would have to pass[['A', 'B']]
. This PR proposes to add tests for the current behaviour.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback
In case the above was not very clearly worded, this corresponds exactly to behaviour on
master
: