Parametrized NA sentinel for factorize #20473
Changes from all commits: 872c24a, 3c18428, 703ab8a, ab32e0f, 62fa538, 28fad50, 8580754, cf14ee1, 8141131, a23d451, b25f3d4, dfcda85, eaff342, c05c807, e786253, 465d458
pandas/core/algorithms.py
@@ -29,7 +29,7 @@
     _ensure_float64, _ensure_uint64,
     _ensure_int64)
 from pandas.compat.numpy import _np_version_under1p10
-from pandas.core.dtypes.missing import isna
+from pandas.core.dtypes.missing import isna, na_value_for_dtype

 from pandas.core import common as com
 from pandas._libs import algos, lib, hashtable as htable
@@ -435,19 +435,23 @@ def isin(comps, values):
     return f(comps, values)


-def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
+def _factorize_array(values, na_sentinel=-1, size_hint=None,
+                     na_value=None):
     """Factorize an array-like to labels and uniques.

     This doesn't do any coercion of types or unboxing before factorization.

     Parameters
     ----------
     values : ndarray
-    check_nulls : bool
-        Whether to check for nulls in the hashtable's 'get_labels' method.
     na_sentinel : int, default -1
     size_hint : int, optional
         Passed through to the hashtable's 'get_labels' method.
+    na_value : object, optional
+        A value in `values` to consider missing. Note: only use this
+        parameter when you know that you don't have any values pandas would
+        consider missing in the array (NaN for float data, iNaT for
+        datetimes, etc.).

     Returns
     -------
@@ -457,7 +461,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):

     table = hash_klass(size_hint or len(values))
     uniques = vec_klass()
-    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
+    labels = table.get_labels(values, uniques, 0, na_sentinel,
+                              na_value=na_value)

     labels = _ensure_platform_int(labels)
     uniques = uniques.to_array()
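For illustration, here is a hedged sketch of what the new `na_value` hook does at the hashtable level, using the post-PR `get_labels` signature shown in this diff. `Int64HashTable` and `Int64Vector` are pandas internals (and have been renamed in later versions), so treat this as a sketch of the mechanism rather than a stable API:

```python
import numpy as np
from pandas._libs import hashtable as htable

# iNaT, the int64 representation of NaT, is the minimum int64 value.
iNaT = np.iinfo(np.int64).min

# datetime64 data is factorized through its int64 view, so NaT arrives
# here as iNaT -- an ordinary integer that a `val != val` NaN check
# cannot detect. Passing na_value=iNaT tells the table what to treat
# as missing.
values = np.array([10, iNaT, 10, 20], dtype=np.int64)

table = htable.Int64HashTable(len(values))
uniques = htable.Int64Vector()
labels = table.get_labels(values, uniques, 0, -1, na_value=iNaT)

print(labels)              # [ 0 -1  0  1]: iNaT mapped to the -1 sentinel
print(uniques.to_array())  # [10 20]: iNaT kept out of the uniques
```

This is also why the docstring's caveat exists: detection hinges on matching the single `na_value` marker, so the caller must know no other pandas-style missing values can occur in the array.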
@@ -508,10 +513,18 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         dtype = original.dtype
     else:
         values, dtype, _ = _ensure_data(values)
-        check_nulls = not is_integer_dtype(original)
-        labels, uniques = _factorize_array(values, check_nulls,
+
+        if (is_datetime64_any_dtype(original) or
+                is_timedelta64_dtype(original) or
+                is_period_dtype(original)):
+            na_value = na_value_for_dtype(original.dtype)
+        else:
+            na_value = None
+
+        labels, uniques = _factorize_array(values,
                                            na_sentinel=na_sentinel,
-                                           size_hint=size_hint)
+                                           size_hint=size_hint,
+                                           na_value=na_value)

     if sort and len(uniques) > 0:
         from pandas.core.sorting import safe_sort
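The user-visible effect (an illustrative example, not part of the diff): with `na_value` wired through, factorizing datetime-like data maps NaT to the sentinel instead of treating its int64 representation as a regular value:

```python
import pandas as pd

arr = pd.to_datetime(['2018-01-01', None, '2018-01-01', '2018-03-01'])
labels, uniques = pd.factorize(arr)

print(labels)   # [ 0 -1  0  1]: NaT gets the na_sentinel (-1)
print(uniques)  # DatetimeIndex(['2018-01-01', '2018-03-01'], dtype='datetime64[ns]')
```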
pandas/tests/dtypes/test_missing.py
@@ -15,7 +15,8 @@
 from pandas import (NaT, Float64Index, Series,
                     DatetimeIndex, TimedeltaIndex, date_range)
 from pandas.core.dtypes.common import is_scalar
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.dtypes import (
+    DatetimeTZDtype, PeriodDtype, IntervalDtype)
 from pandas.core.dtypes.missing import (
     array_equivalent, isna, notna, isnull, notnull,
     na_value_for_dtype)
@@ -311,23 +312,27 @@ def test_array_equivalent_str():
                           np.array(['A', 'X'], dtype=dtype))


-def test_na_value_for_dtype():
-    for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'),
-                  DatetimeTZDtype('datetime64[ns, US/Eastern]')]:
-        assert na_value_for_dtype(dtype) is NaT
-
-    for dtype in ['u1', 'u2', 'u4', 'u8',
-                  'i1', 'i2', 'i4', 'i8']:
-        assert na_value_for_dtype(np.dtype(dtype)) == 0
-
-    for dtype in ['bool']:
-        assert na_value_for_dtype(np.dtype(dtype)) is False
-
-    for dtype in ['f2', 'f4', 'f8']:
-        assert np.isnan(na_value_for_dtype(np.dtype(dtype)))
-
-    for dtype in ['O']:
-        assert np.isnan(na_value_for_dtype(np.dtype(dtype)))
+@pytest.mark.parametrize('dtype, na_value', [
+    # Datetime-like
+    (np.dtype("M8[ns]"), NaT),
+    (np.dtype("m8[ns]"), NaT),
+    (DatetimeTZDtype('datetime64[ns, US/Eastern]'), NaT),
+    (PeriodDtype("M"), NaT),
+    # Integer
+    ('u1', 0), ('u2', 0), ('u4', 0), ('u8', 0),
+    ('i1', 0), ('i2', 0), ('i4', 0), ('i8', 0),
+    # Bool
+    ('bool', False),
+    # Float
+    ('f2', np.nan), ('f4', np.nan), ('f8', np.nan),
+    # Object
+    ('O', np.nan),
+    # Interval
+    (IntervalDtype(), np.nan),
+])
+def test_na_value_for_dtype(dtype, na_value):
+    result = na_value_for_dtype(dtype)
+    assert result is na_value


 class TestNAObj(object):

Review comment (on the PeriodDtype case): nice!
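As a quick reference for what the parametrized cases assert, `na_value_for_dtype` picks the missing-value marker appropriate to a dtype. A short sketch using only cases covered by the test above:

```python
import numpy as np
from pandas import NaT
from pandas.core.dtypes.missing import na_value_for_dtype

assert na_value_for_dtype(np.dtype('M8[ns]')) is NaT  # datetime-likes use NaT
assert na_value_for_dtype(np.dtype('i8')) == 0        # integers cannot hold NaN
assert na_value_for_dtype(np.dtype('bool')) is False
assert np.isnan(na_value_for_dtype(np.dtype('f8')))   # floats (and object) use NaN
```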
Review comment: when is float_group used? seems superfluous?
Reply: Seems to be used in unique.
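For context on the exchange above (an illustrative sketch, not from the diff, and the tie to `float_group` is an inference): the float-specific handling in the hashtable's `unique` matters because NaN != NaN, so without it every NaN encountered would be reported as a distinct unique value:

```python
import numpy as np
import pandas as pd

arr = np.array([1.0, np.nan, np.nan, 2.0])

# pandas' hashtable collapses all NaNs into a single entry...
print(pd.unique(arr))    # [ 1. nan  2.]

# ...whereas naive equality would treat each NaN as new, since NaN != NaN.
print(np.nan == np.nan)  # False
```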