-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: Categoricals shouldn't allow non-strings when object dtype is passed (#13919) #14047
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,8 @@ | |
is_categorical_dtype, | ||
is_integer_dtype, is_bool, | ||
is_list_like, is_sequence, | ||
is_scalar) | ||
is_scalar, | ||
is_object_dtype) | ||
from pandas.core.common import is_null_slice | ||
|
||
from pandas.core.algorithms import factorize, take_1d | ||
|
@@ -191,6 +192,8 @@ class Categorical(PandasObject): | |
If an explicit ``ordered=True`` is given but no `categories` and the | ||
`values` are not sortable. | ||
|
||
If an `object` dtype is passed and `values` contains dtypes other | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mixed dtypes |
||
than all strings or all periods. | ||
|
||
Examples | ||
-------- | ||
|
@@ -324,6 +327,18 @@ def __init__(self, values, categories=None, ordered=False, | |
"mean to use\n'Categorical.from_codes(codes, " | ||
"categories)'?", RuntimeWarning, stacklevel=2) | ||
|
||
# TODO: disallow period when they stop being handled as object dtype | ||
# categoricals w/ object dtype shouldn't allow non-strings | ||
if is_object_dtype(categories) and len(categories) > 0: | ||
from pandas.lib import infer_dtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can be imported at the top |
||
mask = notnull(categories) | ||
if infer_dtype(categories[mask]) not in ['period', | ||
'unicode', | ||
'string']: | ||
raise TypeError( | ||
"Categoricals cannot be object dtype unless" | ||
" all values are strings or all are periods.") | ||
|
||
self.set_ordered(ordered or False, inplace=True) | ||
self._categories = categories | ||
self._codes = _coerce_indexer_dtype(codes, categories) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -94,13 +94,35 @@ def test_constructor_unsortable(self): | |
|
||
# it works! | ||
arr = np.array([1, 2, 3, datetime.now()], dtype='O') | ||
factor = Categorical.from_array(arr, ordered=False) | ||
self.assertFalse(factor.ordered) | ||
msg = "Categoricals cannot be object dtype unless all values are " \ | ||
"strings or all are periods." | ||
with tm.assertRaisesRegexp(TypeError, msg): | ||
factor = Categorical.from_array(arr, ordered=False) | ||
|
||
# this however will raise as cannot be sorted | ||
self.assertRaises( | ||
TypeError, lambda: Categorical.from_array(arr, ordered=True)) | ||
|
||
def test_constructor_object_dtype(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can u add tests:
I think some of them can't be covered because of |
||
#GH 13919 | ||
|
||
#categories must be of single dtype | ||
arr = np.array([1, 2, 3, 's'], dtype=object) | ||
msg = "Categoricals cannot be object dtype unless all values are " \ | ||
"strings or all are periods." | ||
with tm.assertRaisesRegexp(TypeError, msg): | ||
c = Categorical.from_array(arr) | ||
|
||
# object dtype allowed when all strs | ||
exp_arr = np.array(list('abcd'), dtype=object) | ||
c = Categorical.from_array(exp_arr) | ||
tm.assert_numpy_array_equal(c.__array__(), exp_arr) | ||
|
||
# object dtype also allowed when all periods | ||
idx = pd.period_range('1/1/2000', freq='D', periods=5) | ||
c = Categorical(idx) | ||
tm.assert_index_equal(c.categories, idx) | ||
|
||
def test_is_equal_dtype(self): | ||
|
||
# test dtype comparisons between cats | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think it is a bug. Pls move this to API change section, and needs more detailed descriptions as it breaks existing user's code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just say mixed dtype categoricals