From 872c24a0658a0a3deab91b8b76be46cf8fe34aed Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 11:21:09 -0500 Subject: [PATCH 01/14] ENH: Parametrized NA sentinel for factorize Adds a new keyword `na_value` to control the NA sentinel inside the factorize routine. ```python In [3]: arr = np.array([0, 1, 0, 2], dtype='u8') In [4]: pd.factorize(arr) Out[4]: (array([0, 1, 0, 2]), array([0, 1, 2], dtype=uint64)) In [5]: pd.factorize(arr, na_value=0) Out[5]: (array([-1, 0, -1, 1]), array([1, 2], dtype=uint64)) ``` --- pandas/_libs/hashtable.pxd | 25 ++++++++--- pandas/_libs/hashtable_class_helper.pxi.in | 50 ++++++++++++++++------ pandas/core/algorithms.py | 20 +++++++-- pandas/tests/test_algos.py | 25 +++++++++++ 4 files changed, 97 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index d735b3c0673b2..0599d9f4119be 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -8,32 +8,47 @@ cdef class HashTable: pass cdef class UInt64HashTable(HashTable): - cdef kh_uint64_t *table + cdef: + kh_uint64_t *table + uint64_t na_value + bint use_na_value cpdef get_item(self, uint64_t val) cpdef set_item(self, uint64_t key, Py_ssize_t val) cdef class Int64HashTable(HashTable): - cdef kh_int64_t *table + cdef: + kh_int64_t *table + int64_t na_value + bint use_na_value cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) cdef class Float64HashTable(HashTable): - cdef kh_float64_t *table + cdef: + kh_float64_t *table + float64_t na_value + bint use_na_value cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) cdef class PyObjectHashTable(HashTable): - cdef kh_pymap_t *table + cdef: + kh_pymap_t *table + object na_value + bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) cdef class StringHashTable(HashTable): - cdef kh_str_t *table + cdef: + kh_str_t *table + object na_value + bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bca4e388f3279..bf291240a5dc3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -250,13 +250,13 @@ cdef class HashTable: {{py: -# name, dtype, null_condition, float_group -dtypes = [('Float64', 'float64', 'val != val', True), - ('UInt64', 'uint64', 'False', False), - ('Int64', 'int64', 'val == iNaT', False)] +# name, dtype, null_condition, float_group, default_na_value +dtypes = [('Float64', 'float64', 'val != val', True, 'nan'), + ('UInt64', 'uint64', 'False', False, 0), + ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group) in dtypes: + for (name, dtype, null_condition, float_group, default_na_value) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) @@ -300,16 +300,19 @@ def get_dispatch(dtypes): unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) - yield (name, dtype, null_condition, float_group, unique_template) + yield (name, dtype, null_condition, float_group, default_na_value, unique_template) }} -{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} +{{for name, dtype, null_condition, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): - def __cinit__(self, size_hint=1): + def __cinit__(self, size_hint=1, {{dtype}}_t na_value={{default_na_value}}, + bint use_na_value=False): self.table = kh_init_{{dtype}}() + self.na_value = na_value + self.use_na_value = use_na_value if size_hint is not None: kh_resize_{{dtype}}(self.table, size_hint) @@ -414,18 +417,22 @@ cdef class {{name}}HashTable(HashTable): int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val + {{dtype}}_t val, na_value khiter_t k {{name}}VectorData *ud + bint use_na_value labels = np.empty(n, dtype=np.int64) ud = uniques.data + na_value = self.na_value + use_na_value = self.use_na_value with nogil: for i in range(n): val = values[i] - if check_null and {{null_condition}}: + if ((check_null and {{null_condition}}) or + (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -519,8 +526,11 @@ cdef class StringHashTable(HashTable): # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' - def __init__(self, int size_hint=1): + def __init__(self, int size_hint=1, object na_value=na_string_sentinel, + bint use_na_value=False): self.table = kh_init_str() + self.na_value = na_value + self.use_na_value = use_na_value if size_hint is not None: kh_resize_str(self.table, size_hint) @@ -706,18 +716,23 @@ cdef class StringHashTable(HashTable): char *v char **vecs khiter_t k + bint use_na_value # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + na_value = self.na_value + use_na_value = self.use_na_value + # pre-filter out missing # and assign pointers vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if PyUnicode_Check(val) or PyString_Check(val): + if ((PyUnicode_Check(val) or PyString_Check(val)) and + not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v else: @@ -753,8 +768,11 @@ na_sentinel = object cdef class PyObjectHashTable(HashTable): - def __init__(self, size_hint=1): + def __init__(self, size_hint=1, object na_value=na_sentinel, + bint use_na_value=False): self.table = kh_init_pymap() + self.na_value = na_value + self.use_na_value = use_na_value kh_resize_pymap(self.table, size_hint) def __dealloc__(self): @@ -876,14 +894,18 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k + bint use_na_value labels = np.empty(n, dtype=np.int64) + na_value = self.na_value + use_na_value = self.use_na_value for i in range(n): val = values[i] hash(val) - if check_null and val != val or val is None: + if ((check_null and val != val or val is None) or + (use_na_value and val == na_value)): labels[i] = na_sentinel continue diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index de2e638265f1e..d21022f5a8c22 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,7 +435,8 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): +def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, + na_value=None): """Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -455,7 +456,13 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - table = hash_klass(size_hint or len(values)) + use_na_value = na_value is not None + kwargs = dict(use_na_value=use_na_value) + + if use_na_value: + kwargs['na_value'] = na_value + + table = hash_klass(size_hint or len(values), **kwargs) uniques = vec_klass() labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) @@ -465,7 +472,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): @deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, + na_value=None): """ Encode input values as an enumerated type or categorical variable @@ -479,6 +487,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer + na_value : object, optional + A value in `values` to consider missing. Returns ------- @@ -509,9 +519,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): else: values, dtype, _ = _ensure_data(values) check_nulls = not is_integer_dtype(original) + labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, - size_hint=size_hint) + size_hint=size_hint, + na_value=na_value) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 884b1eb7342c6..0ec28e05e27fc 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -257,6 +257,31 @@ def test_deprecate_order(self): with tm.assert_produces_warning(False): algos.factorize(data) + @pytest.mark.parametrize('data', [ + np.array([0, 1, 0], dtype='u8'), + np.array([-2**63, 1, -2**63], dtype='i8'), + np.array(['__nan__', 'foo', '__nan__'], dtype='object'), + ]) + def test_parametrized_factorize_na_value_default(self, data): + # arrays that include the NA default for that type, but isn't used. + l, u = pd.factorize(data) + expected_uniques = data[[0, 1]] + expected_labels = np.array([0, 1, 0]) + tm.assert_numpy_array_equal(l, expected_labels) + tm.assert_numpy_array_equal(u, expected_uniques) + + @pytest.mark.parametrize('data, na_value', [ + (np.array([0, 1, 0, 2], dtype='u8'), 0), + (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), + (np.array(['', 'a', '', 'b'], dtype=object), '') + ]) + def test_parametrized_factorize_na_value(self, data, na_value): + l, u = pd.factorize(data, na_value=na_value) + expected_uniques = data[[1, 3]] + expected_labels = np.array([-1, 0, -1, 1]) + tm.assert_numpy_array_equal(l, expected_labels) + tm.assert_numpy_array_equal(u, expected_uniques) + class TestUnique(object): From 3c184285e8ef3fbffd193985f324cfe42e0f3bab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 15:07:11 -0500 Subject: [PATCH 02/14] REF: Moved to get_labels --- pandas/_libs/hashtable.pxd | 10 ------ pandas/_libs/hashtable_class_helper.pxi.in | 39 ++++++++-------------- pandas/core/algorithms.py | 5 +-- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 0599d9f4119be..445f4f1c7e751 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -10,8 +10,6 @@ cdef class HashTable: cdef class UInt64HashTable(HashTable): cdef: kh_uint64_t *table - uint64_t na_value - bint use_na_value cpdef get_item(self, uint64_t val) cpdef set_item(self, uint64_t key, Py_ssize_t val) @@ -19,8 +17,6 @@ cdef class UInt64HashTable(HashTable): cdef class Int64HashTable(HashTable): cdef: kh_int64_t *table - int64_t na_value - bint use_na_value cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) @@ -28,8 +24,6 @@ cdef class Int64HashTable(HashTable): cdef class Float64HashTable(HashTable): cdef: kh_float64_t *table - float64_t na_value - bint use_na_value cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) @@ -37,8 +31,6 @@ cdef class Float64HashTable(HashTable): cdef class PyObjectHashTable(HashTable): cdef: kh_pymap_t *table - object na_value - bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) @@ -47,8 +39,6 @@ cdef class PyObjectHashTable(HashTable): cdef class StringHashTable(HashTable): cdef: kh_str_t *table - object na_value - bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bf291240a5dc3..af60eb4b1d56d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -308,11 +308,8 @@ def get_dispatch(dtypes): cdef class {{name}}HashTable(HashTable): - def __cinit__(self, size_hint=1, {{dtype}}_t na_value={{default_na_value}}, - bint use_na_value=False): + def __cinit__(self, size_hint=1): self.table = kh_init_{{dtype}}() - self.na_value = na_value - self.use_na_value = use_na_value if size_hint is not None: kh_resize_{{dtype}}(self.table, size_hint) @@ -411,21 +408,20 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): + bint check_null=True, + {{dtype}}_t na_value={{default_na_value}}, + bint use_na_value=False): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val, na_value + {{dtype}}_t val khiter_t k {{name}}VectorData *ud - bint use_na_value labels = np.empty(n, dtype=np.int64) ud = uniques.data - na_value = self.na_value - use_na_value = self.use_na_value with nogil: for i in range(n): @@ -526,11 +522,8 @@ cdef class StringHashTable(HashTable): # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' - def __init__(self, int size_hint=1, object na_value=na_string_sentinel, - bint use_na_value=False): + def __init__(self, int size_hint=1): self.table = kh_init_str() - self.na_value = na_value - self.use_na_value = use_na_value if size_hint is not None: kh_resize_str(self.table, size_hint) @@ -705,7 +698,9 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=1): + bint check_null=1, + object na_value=na_string_sentinel, + bint use_na_value=False): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -716,15 +711,11 @@ cdef class StringHashTable(HashTable): char *v char **vecs khiter_t k - bint use_na_value # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) - na_value = self.na_value - use_na_value = self.use_na_value - # pre-filter out missing # and assign pointers vecs = malloc(n * sizeof(char *)) @@ -768,11 +759,8 @@ na_sentinel = object cdef class PyObjectHashTable(HashTable): - def __init__(self, size_hint=1, object na_value=na_sentinel, - bint use_na_value=False): + def __init__(self, size_hint=1): self.table = kh_init_pymap() - self.na_value = na_value - self.use_na_value = use_na_value kh_resize_pymap(self.table, size_hint) def __dealloc__(self): @@ -886,7 +874,9 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): + bint check_null=True, + object na_value=na_sentinel, + bint use_na_value=False): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -894,11 +884,8 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - bint use_na_value labels = np.empty(n, dtype=np.int64) - na_value = self.na_value - use_na_value = self.use_na_value for i in range(n): val = values[i] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d21022f5a8c22..bbf60c2ee7875 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -462,9 +462,10 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, if use_na_value: kwargs['na_value'] = na_value - table = hash_klass(size_hint or len(values), **kwargs) + table = hash_klass(size_hint or len(values)) uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, + **kwargs) labels = _ensure_platform_int(labels) uniques = uniques.to_array() From 703ab8a2b1a96a89577740b99af7d09375608131 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 16:47:00 -0500 Subject: [PATCH 03/14] Remove python-level use_na_value --- pandas/_libs/hashtable_class_helper.pxi.in | 31 +++++++++++++++------- pandas/core/algorithms.py | 8 +----- pandas/tests/test_algos.py | 4 ++- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index af60eb4b1d56d..3376103d5c2af 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -409,26 +409,37 @@ cdef class {{name}}HashTable(HashTable): def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, bint check_null=True, - {{dtype}}_t na_value={{default_na_value}}, - bint use_na_value=False): + object na_value=None): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val + {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud + bint use_na_value labels = np.empty(n, dtype=np.int64) ud = uniques.data + use_na_value = na_value is not None + + if use_na_value: + # We need this na_value2 because we want to allow users + # to *optionally* specify an NA sentinel *of the correct* type. + # We use None, to make it optional, which requires `object` type + # for the parameter. To please the compiler, we use na_value2, + # which is only used if it's *specified*. + na_value2 = <{{dtype}}_t>na_value + else: + na_value2 = {{default_na_value}} with nogil: for i in range(n): val = values[i] if ((check_null and {{null_condition}}) or - (use_na_value and val == na_value)): + (use_na_value and val == na_value2)): labels[i] = na_sentinel continue @@ -699,22 +710,23 @@ cdef class StringHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, bint check_null=1, - object na_value=na_string_sentinel, - bint use_na_value=False): + object na_value=None): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels int64_t[:] uindexer Py_ssize_t idx, count = count_prior int ret = 0 - object val + object val, na_value2 char *v char **vecs khiter_t k + bint use_na_value # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None # pre-filter out missing # and assign pointers @@ -875,8 +887,7 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, bint check_null=True, - object na_value=na_sentinel, - bint use_na_value=False): + object na_value=None): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -884,8 +895,10 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k + bint use_na_value labels = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None for i in range(n): val = values[i] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bbf60c2ee7875..86a33faf5449f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -456,16 +456,10 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - use_na_value = na_value is not None - kwargs = dict(use_na_value=use_na_value) - - if use_na_value: - kwargs['na_value'] = na_value - table = hash_klass(size_hint or len(values)) uniques = vec_klass() labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, - **kwargs) + na_value=na_value) labels = _ensure_platform_int(labels) uniques = uniques.to_array() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0ec28e05e27fc..c7ab3c53613ee 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -272,8 +272,10 @@ def test_parametrized_factorize_na_value_default(self, data): @pytest.mark.parametrize('data, na_value', [ (np.array([0, 1, 0, 2], dtype='u8'), 0), + (np.array([1, 0, 1, 2], dtype='u8'), 1), (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), - (np.array(['', 'a', '', 'b'], dtype=object), '') + (np.array([1, -2**63, 1, 0], dtype='i8'), 1), + (np.array(['a', '', 'a', 'b'], dtype=object), 'a') ]) def test_parametrized_factorize_na_value(self, data, na_value): l, u = pd.factorize(data, na_value=na_value) From ab32e0fe3922e2b4743c68ec0fed535910ef6137 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 20:24:38 -0500 Subject: [PATCH 04/14] REF: More cleanup Removed null_condition from the template. Had Categorical use na_value --- pandas/_libs/hashtable_class_helper.pxi.in | 19 +++++++++---------- pandas/core/algorithms.py | 2 +- pandas/core/arrays/categorical.py | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3376103d5c2af..b21954f9a6601 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -250,13 +250,13 @@ cdef class HashTable: {{py: -# name, dtype, null_condition, float_group, default_na_value -dtypes = [('Float64', 'float64', 'val != val', True, 'nan'), - ('UInt64', 'uint64', 'False', False, 0), - ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] +# name, dtype, float_group, default_na_value +dtypes = [('Float64', 'float64', True, 'nan'), + ('UInt64', 'uint64', False, 0), + ('Int64', 'int64', False, 'iNaT')] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group, default_na_value) in dtypes: + for (name, dtype, float_group, default_na_value) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) @@ -298,13 +298,13 @@ def get_dispatch(dtypes): return uniques.to_array() """ - unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) + unique_template = unique_template.format(name=name, dtype=dtype, float_group=float_group) - yield (name, dtype, null_condition, float_group, default_na_value, unique_template) + yield (name, dtype, float_group, default_na_value, unique_template) }} -{{for name, dtype, null_condition, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} +{{for name, dtype, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -438,8 +438,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ((check_null and {{null_condition}}) or - (use_na_value and val == na_value2)): + if check_null and (val != val or val == na_value2): labels[i] = na_sentinel continue diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 86a33faf5449f..88212a4d69bdc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -513,7 +513,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = not is_integer_dtype(original) + check_nulls = na_value is not None or not is_integer_dtype(original) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eadef37da344..7a0da69fdbcea 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2163,11 +2163,11 @@ def factorize(self, na_sentinel=-1): from pandas.core.algorithms import _factorize_array codes = self.codes.astype('int64') - codes[codes == -1] = iNaT # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. labels, uniques = _factorize_array(codes, check_nulls=True, - na_sentinel=na_sentinel) + na_sentinel=na_sentinel, + na_value=-1) uniques = self._constructor(self.categories.take(uniques), categories=self.categories, ordered=self.ordered) From 62fa538963cfd6364f569b5a19d9d06718650058 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 06:38:22 -0500 Subject: [PATCH 05/14] API: Make it non-public --- pandas/core/algorithms.py | 17 ++++++++++------- pandas/tests/test_algos.py | 5 +++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88212a4d69bdc..dfefa7cbe3a82 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -446,15 +446,22 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, values : ndarray check_nulls : bool Whether to check for nulls in the hashtable's 'get_labels' method. + Nulls are always checked when `na_value` is specified. na_sentinel : int, default -1 size_hint : int, optional Passsed through to the hashtable's 'get_labels' method + na_value : object, optional + A value in `values` to consider missing. Note: only use this + parameter when you know that you don't have any values pandas would + consider missing in the array (NaN for float data, iNaT for + datetimes, etc.). Returns ------- labels, uniques : ndarray """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + check_nulls = check_nulls or na_value is not None table = hash_klass(size_hint or len(values)) uniques = vec_klass() @@ -467,8 +474,7 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, @deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, - na_value=None): +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -482,8 +488,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer - na_value : object, optional - A value in `values` to consider missing. Returns ------- @@ -513,12 +517,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = na_value is not None or not is_integer_dtype(original) + check_nulls = not is_integer_dtype(original) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, - size_hint=size_hint, - na_value=na_value) + size_hint=size_hint) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c7ab3c53613ee..fa3f6e775da28 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -264,7 +264,7 @@ def test_deprecate_order(self): ]) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. - l, u = pd.factorize(data) + l, u = algos.factorize(data) expected_uniques = data[[0, 1]] expected_labels = np.array([0, 1, 0]) tm.assert_numpy_array_equal(l, expected_labels) @@ -278,7 +278,8 @@ def test_parametrized_factorize_na_value_default(self, data): (np.array(['a', '', 'a', 'b'], dtype=object), 'a') ]) def test_parametrized_factorize_na_value(self, data, na_value): - l, u = pd.factorize(data, na_value=na_value) + l, u = algos._factorize_array(data, check_nulls=True, + na_value=na_value) expected_uniques = data[[1, 3]] expected_labels = np.array([-1, 0, -1, 1]) tm.assert_numpy_array_equal(l, expected_labels) From 28fad508371466992b65a8e8cefb4866170734aa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 06:54:30 -0500 Subject: [PATCH 06/14] Revert formatting changes in pxd --- pandas/_libs/hashtable.pxd | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 445f4f1c7e751..d735b3c0673b2 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -8,37 +8,32 @@ cdef class HashTable: pass cdef class UInt64HashTable(HashTable): - cdef: - kh_uint64_t *table + cdef kh_uint64_t *table cpdef get_item(self, uint64_t val) cpdef set_item(self, uint64_t key, Py_ssize_t val) cdef class Int64HashTable(HashTable): - cdef: - kh_int64_t *table + cdef kh_int64_t *table cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) cdef class Float64HashTable(HashTable): - cdef: - kh_float64_t *table + cdef kh_float64_t *table cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) cdef class PyObjectHashTable(HashTable): - cdef: - kh_pymap_t *table + cdef kh_pymap_t *table cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) cdef class StringHashTable(HashTable): - cdef: - kh_str_t *table + cdef kh_str_t *table cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) From 85807545ef2a93ff2585920fc233fcb5bc3c2959 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 07:00:36 -0500 Subject: [PATCH 07/14] linting --- pandas/core/arrays/categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7a0da69fdbcea..b25f23cde5c49 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -7,7 +7,6 @@ from pandas import compat from pandas.compat import u, lzip from pandas._libs import lib, algos as libalgos -from pandas._libs.tslib import iNaT from pandas.core.dtypes.generic import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex) From cf14ee18451333c95ae03e0017ca34150f1d8b25 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 07:35:54 -0500 Subject: [PATCH 08/14] Handle bool --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dfefa7cbe3a82..bf192cdb2c300 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -517,7 +517,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = not is_integer_dtype(original) + check_nulls = (not is_integer_dtype(original) and + not is_bool_dtype(original)) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, From a23d451e08b43ba00f4b3ea8a87f2074e0ceda11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 13:55:56 -0500 Subject: [PATCH 09/14] Specify dtypes --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fa3f6e775da28..ecad2cc042cb3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -266,7 +266,7 @@ def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. l, u = algos.factorize(data) expected_uniques = data[[0, 1]] - expected_labels = np.array([0, 1, 0]) + expected_labels = np.array([0, 1, 0], dtype='i8') tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) @@ -281,7 +281,7 @@ def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, check_nulls=True, na_value=na_value) expected_uniques = data[[1, 3]] - expected_labels = np.array([-1, 0, -1, 1]) + expected_labels = np.array([-1, 0, -1, 1], dtype='i8') tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) From b25f3d498cf6d2cd7ea82cab3cd32e4e8974139b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 17:03:29 -0500 Subject: [PATCH 10/14] Remove unused variable. Added PyObject hashtable test --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/tests/test_algos.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b21954f9a6601..c72f644823ccb 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -716,7 +716,7 @@ cdef class StringHashTable(HashTable): int64_t[:] uindexer Py_ssize_t idx, count = count_prior int ret = 0 - object val, na_value2 + object val char *v char **vecs khiter_t k diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ecad2cc042cb3..5c42caa1720bb 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -275,7 +275,10 @@ def test_parametrized_factorize_na_value_default(self, data): (np.array([1, 0, 1, 2], dtype='u8'), 1), (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), (np.array([1, -2**63, 1, 0], dtype='i8'), 1), - (np.array(['a', '', 'a', 'b'], dtype=object), 'a') + (np.array(['a', '', 'a', 'b'], dtype=object), 'a'), + (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()), + (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object), + ('a', 1)), ]) def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, check_nulls=True, From dfcda85521502147b4e51d8300e05de7cb78b9dd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 19:09:01 -0500 Subject: [PATCH 11/14] REF: Removed check_nulls --- pandas/_libs/hashtable_class_helper.pxi.in | 7 ++----- pandas/core/algorithms.py | 21 +++++++++++---------- pandas/core/arrays/categorical.py | 3 +-- pandas/tests/test_algos.py | 3 +-- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c72f644823ccb..eca66f78499db 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -408,7 +408,6 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -438,7 +437,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if check_null and (val != val or val == na_value2): + if val != val or (use_na_value and val == na_value2): labels[i] = na_sentinel continue @@ -708,7 +707,6 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -885,7 +883,6 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -903,7 +900,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if ((check_null and val != val or val is None) or + if ((val != val or val is None) or (use_na_value and val == na_value)): labels[i] = na_sentinel continue diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bf192cdb2c300..3b75b189c0123 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,7 +435,7 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, +def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): """Factorize an array-like to labels and uniques. @@ -444,9 +444,6 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, Parameters ---------- values : ndarray - check_nulls : bool - Whether to check for nulls in the hashtable's 'get_labels' method. - Nulls are always checked when `na_value` is specified. na_sentinel : int, default -1 size_hint : int, optional Passsed through to the hashtable's 'get_labels' method @@ -461,11 +458,10 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, labels, uniques : ndarray """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - check_nulls = check_nulls or na_value is not None table = hash_klass(size_hint or len(values)) uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, + labels = table.get_labels(values, uniques, 0, na_sentinel, na_value=na_value) labels = _ensure_platform_int(labels) @@ -517,12 +513,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = (not is_integer_dtype(original) and - not is_bool_dtype(original)) - labels, uniques = _factorize_array(values, check_nulls, + if (is_datetime64_any_dtype(original) or + is_timedelta64_dtype(original)): + na_value = iNaT + else: + na_value = None + + labels, uniques = _factorize_array(values, na_sentinel=na_sentinel, - size_hint=size_hint) + size_hint=size_hint, + na_value=na_value) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b25f23cde5c49..ac57660300be4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2164,8 +2164,7 @@ def factorize(self, na_sentinel=-1): codes = self.codes.astype('int64') # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. - labels, uniques = _factorize_array(codes, check_nulls=True, - na_sentinel=na_sentinel, + labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel, na_value=-1) uniques = self._constructor(self.categories.take(uniques), categories=self.categories, diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5c42caa1720bb..ada4f880e92a4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -281,8 +281,7 @@ def test_parametrized_factorize_na_value_default(self, data): ('a', 1)), ]) def test_parametrized_factorize_na_value(self, data, na_value): - l, u = algos._factorize_array(data, check_nulls=True, - na_value=na_value) + l, u = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] expected_labels = np.array([-1, 0, -1, 1], dtype='i8') tm.assert_numpy_array_equal(l, expected_labels) From eaff342dbce56d8b1b8ea09f840477e27754c31b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 06:37:09 -0500 Subject: [PATCH 12/14] BUG: NaT for period --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3b75b189c0123..fcc07dfa3f3a8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -515,7 +515,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): values, dtype, _ = _ensure_data(values) if (is_datetime64_any_dtype(original) or - is_timedelta64_dtype(original)): + is_timedelta64_dtype(original) or + is_period_dtype(original)): na_value = iNaT else: na_value = None From e786253a1e377ed8950268878ef42c2d86443d54 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 06:51:09 -0500 Subject: [PATCH 13/14] Other hashtable --- pandas/_libs/hashtable.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 07b4b80603e03..15d93374da3a9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -70,7 +70,7 @@ cdef class Factorizer: return self.count def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1, - check_null=True): + na_value=None): """ Factorize values with nans replaced by na_sentinel >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) @@ -81,7 +81,7 @@ cdef class Factorizer: uniques.extend(self.uniques.to_array()) self.uniques = uniques labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, check_null) + self.count, na_sentinel, na_value) mask = (labels == na_sentinel) # sort on if sort: @@ -114,7 +114,7 @@ cdef class Int64Factorizer: return self.count def factorize(self, int64_t[:] values, sort=False, - na_sentinel=-1, check_null=True): + na_sentinel=-1, na_value=None): """ Factorize values with nans replaced by na_sentinel >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) @@ -126,7 +126,7 @@ cdef class Int64Factorizer: self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, - check_null) + na_value=na_value) # sort on if sort: From 465d458573c99ee218299c2713c2e5abd305907c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 07:14:48 -0500 Subject: [PATCH 14/14] na_value_for_dtype PeriodDtype --- pandas/core/algorithms.py | 4 +-- pandas/core/dtypes/missing.py | 3 ++- pandas/tests/dtypes/test_missing.py | 41 ++++++++++++++++------------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fcc07dfa3f3a8..45f86f044a4b2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -29,7 +29,7 @@ _ensure_float64, _ensure_uint64, _ensure_int64) from pandas.compat.numpy import _np_version_under1p10 -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com from pandas._libs import algos, lib, hashtable as htable @@ -517,7 +517,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if (is_datetime64_any_dtype(original) or is_timedelta64_dtype(original) or is_period_dtype(original)): - na_value = iNaT + na_value = na_value_for_dtype(original.dtype) else: na_value = None diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 01c88c269e7e0..7be00cbfd567a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,6 +11,7 @@ is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, + is_period_dtype, is_complex_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, @@ -393,7 +394,7 @@ def na_value_for_dtype(dtype, compat=True): dtype = pandas_dtype(dtype) if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or - is_timedelta64_dtype(dtype)): + is_timedelta64_dtype(dtype) or is_period_dtype(dtype)): return NaT elif is_float_dtype(dtype): return np.nan diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 4f208bc352c70..365d8d762d673 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -15,7 +15,8 @@ from pandas import (NaT, Float64Index, Series, DatetimeIndex, TimedeltaIndex, date_range) from pandas.core.dtypes.common import is_scalar -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, PeriodDtype, IntervalDtype) from pandas.core.dtypes.missing import ( array_equivalent, isna, notna, isnull, notnull, na_value_for_dtype) @@ -311,23 +312,27 @@ def test_array_equivalent_str(): np.array(['A', 'X'], dtype=dtype)) -def test_na_value_for_dtype(): - for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), - DatetimeTZDtype('datetime64[ns, US/Eastern]')]: - assert na_value_for_dtype(dtype) is NaT - - for dtype in ['u1', 'u2', 'u4', 'u8', - 'i1', 'i2', 'i4', 'i8']: - assert na_value_for_dtype(np.dtype(dtype)) == 0 - - for dtype in ['bool']: - assert na_value_for_dtype(np.dtype(dtype)) is False - - for dtype in ['f2', 'f4', 'f8']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - for dtype in ['O']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) +@pytest.mark.parametrize('dtype, na_value', [ + # Datetime-like + (np.dtype("M8[ns]"), NaT), + (np.dtype("m8[ns]"), NaT), + (DatetimeTZDtype('datetime64[ns, US/Eastern]'), NaT), + (PeriodDtype("M"), NaT), + # Integer + ('u1', 0), ('u2', 0), ('u4', 0), ('u8', 0), + ('i1', 0), ('i2', 0), ('i4', 0), ('i8', 0), + # Bool + ('bool', False), + # Float + ('f2', np.nan), ('f4', np.nan), ('f8', np.nan), + # Object + ('O', np.nan), + # Interval + (IntervalDtype(), np.nan), +]) +def test_na_value_for_dtype(dtype, na_value): + result = na_value_for_dtype(dtype) + assert result is na_value class TestNAObj(object):