From 28ec3a91dbb4efd74f311940d354fe06f8fbd061 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 23 Aug 2018 17:15:03 +0200 Subject: [PATCH 1/9] API: better error-handling for df.set_index --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/frame.py | 47 ++++++++++++++++++--------- pandas/tests/frame/test_alter_axes.py | 34 ++++++++++++------- 3 files changed, 55 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3b61fde77cb9f..da6ed2d2c7adc 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -551,6 +551,7 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) .. _whatsnew_0240.deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 721c31c57bc06..b4a72706871cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -60,6 +60,7 @@ is_sequence, is_named_tuple) from pandas.core.dtypes.concat import _get_sliced_frame_result_type +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex from pandas.core.dtypes.missing import isna, notna @@ -3912,6 +3913,22 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not isinstance(keys, list): keys = [keys] + missing = [] + for x in keys: + if not (is_scalar(x) or isinstance(x, tuple)): + if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex, + list, np.ndarray)): + raise TypeError('keys may only contain a combination of ' + 'the following: valid column keys, ' + 'Series, Index, MultiIndex, list or ' + 'np.ndarray') + else: + if x not in self: + missing.append(x) + + if missing: + raise KeyError('{}'.format(missing)) + if inplace: frame = self else: @@ -3921,7 +3938,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, names = [] if append: names = [x for x in self.index.names] - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: @@ -3929,29 +3946,26 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, MultiIndex): - # append all but the last column so we don't have to modify - # the end of this loop - for n in range(col.nlevels - 1): + if isinstance(col, ABCMultiIndex): + for n in range(col.nlevels): arrays.append(col._get_level_values(n)) - - level = col._get_level_values(col.nlevels - 1) names.extend(col.names) - elif isinstance(col, Series): - level = col._values + elif isinstance(col, ABCIndexClass): + # Index but not MultiIndex (treated above) + arrays.append(col) names.append(col.name) - elif isinstance(col, Index): - level = col + elif isinstance(col, ABCSeries): + arrays.append(col._values) names.append(col.name) - elif isinstance(col, (list, np.ndarray, Index)): - level = col + elif isinstance(col, (list, np.ndarray)): + arrays.append(col) names.append(None) + # from here, col can only be a column label else: - level = frame[col]._values + arrays.append(frame[col]._values) names.append(col) if drop: to_remove.append(col) - arrays.append(level) index = ensure_index_from_sequences(arrays, names) @@ -3960,7 +3974,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, raise ValueError('Index has duplicate keys: {dup}'.format( dup=duplicates)) - for c in to_remove: + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): del frame[c] # clear up memory usage diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 4e61c9c62266d..8c635b50a7c25 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -186,18 +186,19 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # == gives ambiguous Boolean for Series if drop and keys[0] is 'A' and keys[1] is 'A': - with tm.assert_raises_regex(KeyError, '.*'): - df.set_index(keys, drop=drop, append=append) + # can't drop same column twice + first_drop = False else: - result = df.set_index(keys, drop=drop, append=append) + first_drop = drop - # to test against already-tested behavior, we add sequentially, - # hence second append always True; must wrap in list, otherwise - # list-box will be illegal - expected = df.set_index([keys[0]], drop=drop, append=append) - expected = expected.set_index([keys[1]], drop=drop, append=True) + # to test against already-tested behaviour, we add sequentially, + # hence second append always True; must wrap in list, otherwise + # list-box will be illegal + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) - tm.assert_frame_equal(result, expected) + result = df.set_index(keys, drop=drop, append=append) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) @@ -229,13 +230,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): def test_set_index_raise(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E + with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"): + # column names are A-E df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) # non-existent key in list with arrays - with tm.assert_raises_regex(KeyError, '.*'): + with tm.assert_raises_regex(KeyError, 'X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) + rgx = 'keys may only contain a combination of the following:.*' + # forbidden type, e.g. set + with tm.assert_raises_regex(TypeError, rgx): + df.set_index(set(df['A']), drop=drop, append=append) + + # forbidden type in list, e.g. set + with tm.assert_raises_regex(TypeError, rgx): + df.set_index(['A', df['A'], set(df['A'])], + drop=drop, append=append) + def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) ci.name = 'B' From 8dd1453ad1f2b0a28d54c891ea3322c6d1fd1c34 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 19 Sep 2018 10:53:45 +0200 Subject: [PATCH 2/9] Review (jreback) --- pandas/core/frame.py | 8 ++++---- pandas/tests/frame/test_alter_axes.py | 18 +++++++----------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4a72706871cb..05f4f3aed7d2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3918,10 +3918,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not (is_scalar(x) or isinstance(x, tuple)): if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex, list, np.ndarray)): - raise TypeError('keys may only contain a combination of ' - 'the following: valid column keys, ' - 'Series, Index, MultiIndex, list or ' - 'np.ndarray') + raise TypeError('The parameter "keys" may only contain a ' + 'combination of the following: valid ' + 'column keys, Series, Index, MultiIndex, ' + 'list or np.ndarray') else: if x not in self: missing.append(x) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 8c635b50a7c25..b0451c28ef446 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -184,17 +184,13 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] - # == gives ambiguous Boolean for Series - if drop and keys[0] is 'A' and keys[1] is 'A': - # can't drop same column twice - first_drop = False - else: - first_drop = drop - # to test against already-tested behaviour, we add sequentially, - # hence second append always True; must wrap in list, otherwise - # list-box will be illegal - expected = df.set_index([keys[0]], drop=first_drop, append=append) + # hence second append always True; must wrap keys in list, otherwise + # box = list would be illegal; need to adapt drop for case that both + # keys are 'A' -- can't drop the same column twice + expected = df.set_index([keys[0]], drop=(False if (keys[0] is 'A' + and keys[1] is 'A') + else drop), append=append) expected = expected.set_index([keys[1]], drop=drop, append=True) result = df.set_index(keys, drop=drop, append=append) @@ -238,7 +234,7 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append): with tm.assert_raises_regex(KeyError, 'X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) - rgx = 'keys may only contain a combination of the following:.*' + rgx = 'The parameter "keys" may only contain a combination of.*' # forbidden type, e.g. set with tm.assert_raises_regex(TypeError, rgx): df.set_index(set(df['A']), drop=drop, append=append) From 18d3464e021a5d2613e183bc0c8823c736d03d80 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 23:56:20 +0200 Subject: [PATCH 3/9] Review (jreback) --- pandas/tests/frame/test_alter_axes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index b0451c28ef446..2f508f9725283 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -234,13 +234,13 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append): with tm.assert_raises_regex(KeyError, 'X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) - rgx = 'The parameter "keys" may only contain a combination of.*' + msg = 'The parameter "keys" may only contain a combination of.*' # forbidden type, e.g. set - with tm.assert_raises_regex(TypeError, rgx): + with tm.assert_raises_regex(TypeError, msg): df.set_index(set(df['A']), drop=drop, append=append) # forbidden type in list, e.g. set - with tm.assert_raises_regex(TypeError, rgx): + with tm.assert_raises_regex(TypeError, msg): df.set_index(['A', df['A'], set(df['A'])], drop=drop, append=append) From fa68660f8feed9d6d084c00b084fc8fe466a7e8d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 26 Sep 2018 18:31:37 +0200 Subject: [PATCH 4/9] Allow list-likes (review jreback) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 38 ++++++++++-------- pandas/tests/frame/conftest.py | 7 ++-- pandas/tests/frame/test_alter_axes.py | 55 ++++++++++++++++++--------- 4 files changed, 63 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index da6ed2d2c7adc..549c8c5d98360 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -551,7 +551,7 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) +- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) .. _whatsnew_0240.deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 05f4f3aed7d2b..13887c2c3b4f1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -60,7 +60,8 @@ is_sequence, is_named_tuple) from pandas.core.dtypes.concat import _get_sliced_frame_result_type -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex +from pandas.core.dtypes.generic import (ABCSeries, ABCIndexClass, + ABCMultiIndex, ABCDataFrame) from pandas.core.dtypes.missing import isna, notna @@ -3915,16 +3916,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False, missing = [] for x in keys: - if not (is_scalar(x) or isinstance(x, tuple)): - if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex, - list, np.ndarray)): - raise TypeError('The parameter "keys" may only contain a ' - 'combination of the following: valid ' - 'column keys, Series, Index, MultiIndex, ' - 'list or np.ndarray') - else: - if x not in self: - missing.append(x) + if (is_scalar(x) or isinstance(x, tuple)) and x in self: + continue + elif is_scalar(x) and x not in self: + # a tuple gets tried as column key first; + # otherwise considered as a list-like; i.e. not missing + missing.append(x) + elif (not is_list_like(x) or isinstance(x, set) + or isinstance(x, ABCDataFrame) + or (isinstance(x, np.ndarray) and x.ndim > 1)): + raise TypeError('The parameter "keys" may only contain a ' + 'combination of valid column keys and ' + 'one-dimensional list-likes') if missing: raise KeyError('{}'.format(missing)) @@ -3950,16 +3953,19 @@ def set_index(self, keys, drop=True, append=False, inplace=False, for n in range(col.nlevels): arrays.append(col._get_level_values(n)) names.extend(col.names) - elif isinstance(col, ABCIndexClass): - # Index but not MultiIndex (treated above) + elif isinstance(col, (ABCIndexClass, ABCSeries)): + # if Index then not MultiIndex (treated above) arrays.append(col) names.append(col.name) - elif isinstance(col, ABCSeries): - arrays.append(col._values) - names.append(col.name) elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) + elif (is_list_like(col) + and not (isinstance(col, tuple) and col in self)): + # all other list-likes (but avoid valid column keys) + col = list(col) # ensure iterator do not get read twice etc. + arrays.append(col) + names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index fdedb93835d75..4e8fa849b9ccb 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -180,12 +180,13 @@ def frame_of_index_cols(): """ Fixture for DataFrame of columns that can be used for indexing - Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but - are jointly unique), the rest are unique. + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. """ df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], 'B': ['one', 'two', 'three', 'one', 'two'], 'C': ['a', 'b', 'c', 'd', 'e'], 'D': np.random.randn(5), - 'E': np.random.randn(5)}) + 'E': np.random.randn(5), + ('tuple', 'as', 'label'): np.random.randn(5)}) return df diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 2f508f9725283..526b9e181dac2 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -49,7 +49,8 @@ def test_set_index_cast(self): tm.assert_frame_equal(df, df2) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], + ('tuple', 'as', 'label')]) @pytest.mark.parametrize('inplace', [True, False]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_drop_inplace(self, frame_of_index_cols, @@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], + ('tuple', 'as', 'label')]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_append(self, frame_of_index_cols, drop, keys): df = frame_of_index_cols @@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], + ('tuple', 'as', 'label')]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys): @@ -114,8 +117,10 @@ def test_set_index_after_mutation(self): tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda + # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) @pytest.mark.parametrize('box', [Series, Index, np.array, + list, tuple, iter, lambda x: [list(x)], lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, df.index.name = index_name key = box(df['B']) - # np.array and list "forget" the name of B - name = [None if box in [np.array, list] else 'B'] + if box == list: + # list of strings gets interpreted as list of keys + msg = "['one', 'two', 'three', 'one', 'two']" + with tm.assert_raises_regex(KeyError, msg): + df.set_index(key, drop=drop, append=append) + else: + # np.array/tuple/iter/list-of-list "forget" the name of B + name_mi = getattr(key, 'names', None) + name = [getattr(key, 'name', None)] if name_mi is None else name_mi - result = df.set_index(key, drop=drop, append=append) + result = df.set_index(key, drop=drop, append=append) - # only valid column keys are dropped - # since B is always passed as array above, nothing is dropped - expected = df.set_index(['B'], drop=False, append=append) - expected.index.names = [index_name] + name if append else name + # only valid column keys are dropped + # since B is always passed as array above, nothing is dropped + expected = df.set_index(['B'], drop=False, append=append) + expected.index.names = [index_name] + name if append else name - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, list, + @pytest.mark.parametrize('box', [Series, Index, np.array, + list, tuple, iter, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'A'), (True, 'B'), @@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array and list "forget" the name of B - names = ['A', None if box in [np.array, list] else 'B'] + # np.array/list/tuple/iter "forget" the name of B + names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] result = df.set_index(keys, drop=drop, append=append) @@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, list, + @pytest.mark.parametrize('box2', [Series, Index, np.array, + list, tuple, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, list, + @pytest.mark.parametrize('box1', [Series, Index, np.array, + list, tuple, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -182,6 +197,10 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, df = frame_of_index_cols df.index.name = index_name + keys = [box1(df['A']), box2(df['A'])] + result = df.set_index(keys, drop=drop, append=append) + + # if either box was iter, the content has been consumed; re-read it keys = [box1(df['A']), box2(df['A'])] # to test against already-tested behaviour, we add sequentially, @@ -192,8 +211,6 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, and keys[1] is 'A') else drop), append=append) expected = expected.set_index([keys[1]], drop=drop, append=True) - - result = df.set_index(keys, drop=drop, append=append) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False]) @@ -227,7 +244,7 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append): df = frame_of_index_cols with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"): - # column names are A-E + # column names are A-E, as well as one tuple df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) # non-existent key in list with arrays From 3a6469dd9e305802bf1b015c27ad0ae57763efa7 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 3 Oct 2018 17:56:41 +0200 Subject: [PATCH 5/9] Review (jreback) --- pandas/core/frame.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bf959390c6e71..36c36929d4a1a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -60,8 +60,7 @@ is_sequence, is_named_tuple) from pandas.core.dtypes.concat import _get_sliced_frame_result_type -from pandas.core.dtypes.generic import (ABCSeries, ABCIndexClass, - ABCMultiIndex, ABCDataFrame) +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex from pandas.core.dtypes.missing import isna, notna @@ -3955,16 +3954,17 @@ def set_index(self, keys, drop=True, append=False, inplace=False, keys = [keys] missing = [] - for x in keys: - if (is_scalar(x) or isinstance(x, tuple)) and x in self: + for col in keys: + if (is_scalar(col) or isinstance(col, tuple)) and col in self: + # tuples can be both column keys or list-likes + # if they are valid column keys, everything is fine continue - elif is_scalar(x) and x not in self: - # a tuple gets tried as column key first; - # otherwise considered as a list-like; i.e. not missing - missing.append(x) - elif (not is_list_like(x) or isinstance(x, set) - or isinstance(x, ABCDataFrame) - or (isinstance(x, np.ndarray) and x.ndim > 1)): + elif is_scalar(col) and col not in self: + # tuples that are not column keys are considered list-like, + # not considered missing + missing.append(col) + elif (not is_list_like(col) or isinstance(col, set) + or getattr(col, 'ndim', 1) > 1): raise TypeError('The parameter "keys" may only contain a ' 'combination of valid column keys and ' 'one-dimensional list-likes') From dd8d0004fd5154619331894c982fbc3a145c70aa Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 5 Oct 2018 18:44:37 +0200 Subject: [PATCH 6/9] Review (jreback) --- pandas/tests/frame/test_alter_axes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 526b9e181dac2..1b505b965dd65 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -203,13 +203,14 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # if either box was iter, the content has been consumed; re-read it keys = [box1(df['A']), box2(df['A'])] + # need to adapt first drop for case that both keys are 'A' -- + # can't drop the same column twice + first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop + # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise - # box = list would be illegal; need to adapt drop for case that both - # keys are 'A' -- can't drop the same column twice - expected = df.set_index([keys[0]], drop=(False if (keys[0] is 'A' - and keys[1] is 'A') - else drop), append=append) + # box = list would be illegal + expected = df.set_index([keys[0]], drop=first_drop, append=append) expected = expected.set_index([keys[1]], drop=drop, append=True) tm.assert_frame_equal(result, expected) From 47c4e74788c7194fa9864df3876149e7773314cc Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 18:44:02 +0200 Subject: [PATCH 7/9] Improve comment --- pandas/tests/frame/test_alter_axes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 1b505b965dd65..fdeae61f96b93 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -204,7 +204,8 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] # need to adapt first drop for case that both keys are 'A' -- - # can't drop the same column twice + # cannot drop the same column twice; + # use "is" because == would give ambiguous Boolean error for containers first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop # to test against already-tested behaviour, we add sequentially, From 9da6e6c9ce55d0c31f68e1e907aa18b0009a8222 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 18 Oct 2018 08:25:32 +0200 Subject: [PATCH 8/9] Retrigger Circle From 42d5f2a0646c2d5b4bd40647699b489a12713267 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 18 Oct 2018 18:17:35 +0200 Subject: [PATCH 9/9] Incorporate allow_sets-kwarg for is_list_like --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a28d3d3598551..c10d78ce55d0c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3991,7 +3991,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, # tuples that are not column keys are considered list-like, # not considered missing missing.append(col) - elif (not is_list_like(col) or isinstance(col, set) + elif (not is_list_like(col, allow_sets=False) or getattr(col, 'ndim', 1) > 1): raise TypeError('The parameter "keys" may only contain a ' 'combination of valid column keys and '