From c9b091f64c8a1c092131bca49c91fe4186c8ae42 Mon Sep 17 00:00:00 2001 From: William Wagner Date: Wed, 17 Aug 2016 22:09:50 -0400 Subject: [PATCH 1/2] BUG: Categoricals shouldn't allow non-strings when object dtype is passed (#13919) --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/categorical.py | 17 ++++++++++++++++- pandas/tests/test_categorical.py | 26 ++++++++++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cc3cc631b9575..335c8a0f3ed37 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1075,3 +1075,4 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. +- Bug in ``Categorical`` would allow creation when ``object`` dtype was passed in with categories not containing either all non-string or all non-period values \ No newline at end of file diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6ea0a5e96672d..7cd2f2f5dfa4e 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -20,7 +20,8 @@ is_categorical_dtype, is_integer_dtype, is_bool, is_list_like, is_sequence, - is_scalar) + is_scalar, + is_object_dtype) from pandas.core.common import is_null_slice from pandas.core.algorithms import factorize, take_1d @@ -191,6 +192,8 @@ class Categorical(PandasObject): If an explicit ``ordered=True`` is given but no `categories` and the `values` are not sortable. + If an `object` dtype is passed and `values` contains dtypes other + than all strings or all periods. Examples -------- @@ -324,6 +327,18 @@ def __init__(self, values, categories=None, ordered=False, "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) + # TODO: disallow period when they stop being handled as object dtype + # categoricals w/ object dtype shouldn't allow non-strings + if is_object_dtype(categories) and len(categories) > 0: + from pandas.lib import infer_dtype + mask = notnull(categories) + if infer_dtype(categories[mask]) not in ['period', + 'unicode', + 'string']: + raise TypeError( + "Categoricals cannot be object dtype unless" + " all values are strings or all are periods.") + self.set_ordered(ordered or False, inplace=True) self._categories = categories self._codes = _coerce_indexer_dtype(codes, categories) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index b630e0914259e..de880afc5cfc9 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -94,13 +94,35 @@ def test_constructor_unsortable(self): # it works! arr = np.array([1, 2, 3, datetime.now()], dtype='O') - factor = Categorical.from_array(arr, ordered=False) - self.assertFalse(factor.ordered) + msg = "Categoricals cannot be object dtype unless all values are " \ + "strings or all are periods." + with tm.assertRaisesRegexp(TypeError, msg): + factor = Categorical.from_array(arr, ordered=False) # this however will raise as cannot be sorted self.assertRaises( TypeError, lambda: Categorical.from_array(arr, ordered=True)) + def test_constructor_object_dtype(self): + #GH 13919 + + #categories must be of single dtype + arr = np.array([1, 2, 3, 's'], dtype=object) + msg = "Categoricals cannot be object dtype unless all values are " \ + "strings or all are periods." + with tm.assertRaisesRegexp(TypeError, msg): + c = Categorical.from_array(arr) + + # object dtype allowed when all strs + exp_arr = np.array(list('abcd'), dtype=object) + c = Categorical.from_array(exp_arr) + tm.assert_numpy_array_equal(c.__array__(), exp_arr) + + # object dtype also allowed when all periods + idx = pd.period_range('1/1/2000', freq='D', periods=5) + c = Categorical(idx) + tm.assert_index_equal(c.categories, idx) + def test_is_equal_dtype(self): # test dtype comparisons between cats From 2730600272f9bb10ebeeb0e0aef98e5a2a40d3d9 Mon Sep 17 00:00:00 2001 From: William Wagner Date: Fri, 19 Aug 2016 09:13:20 -0400 Subject: [PATCH 2/2] Fixed typo in whatsnew entry --- doc/source/whatsnew/v0.19.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 335c8a0f3ed37..6b3ea9ecc3866 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1075,4 +1075,4 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. -- Bug in ``Categorical`` would allow creation when ``object`` dtype was passed in with categories not containing either all non-string or all non-period values \ No newline at end of file +- Bug in ``Categorical`` would allow creation when ``object`` dtype was passed in with categories not containing either all string or all period values \ No newline at end of file