From 73569971bc029db80499589c26dfa70fef9c0d67 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Mar 2019 15:55:07 +0100 Subject: [PATCH 01/10] BUG: fix usage of na_sentinel with sort=True in factorize() --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/algorithms.py | 20 +++++++++++++------- pandas/tests/test_algos.py | 15 +++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 7da99590d5a0a..839754b828186 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -32,6 +32,7 @@ Fixed Regressions - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`). .. _whatsnew_0242.enhancements: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4a71951e2435e..5ed2e3efe26a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -619,13 +619,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - try: - order = uniques.argsort() - order2 = order.argsort() - labels = take_1d(order2, labels, fill_value=na_sentinel) - uniques = uniques.take(order) - except TypeError: - # Mixed types, where uniques.argsort fails. + if na_sentinel == -1: + # GH-25409 take_1d only works for na_sentinels of -1 + try: + order = uniques.argsort() + order2 = order.argsort() + labels = take_1d(order2, labels, fill_value=na_sentinel) + uniques = uniques.take(order) + except TypeError: + # Mixed types, where uniques.argsort fails. + uniques, labels = safe_sort(uniques, labels, + na_sentinel=na_sentinel, + assume_unique=True) + else: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3f75c508d22f9..7c009f6a2633c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,6 +326,21 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) + @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize('na_sentinel', [-1, -10, 100]) + def test_factorize_na_sentinel(self, sort, na_sentinel): + data = np.array(['b', 'a', None, 'b'], dtype=object) + labels, uniques = algos.factorize(data, sort=sort, + na_sentinel=na_sentinel) + if sort: + expected_labels = np.array([1, 0, na_sentinel, 1]) + expected_uniques = np.array(['a', 'b'], dtype=object) + else: + expected_labels = np.array([0, 1, na_sentinel, 0]) + expected_uniques = np.array(['b', 'a'], dtype=object) + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(uniques, expected_uniques) + class TestUnique(object): From e1ab3a489b1a5825527312d53dcc20f1545f04d6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 18:20:13 +0100 Subject: [PATCH 02/10] fix dtype --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 7c009f6a2633c..083307371b699 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -333,10 +333,10 @@ def test_factorize_na_sentinel(self, sort, na_sentinel): labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: - expected_labels = np.array([1, 0, na_sentinel, 1]) + expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = np.array(['a', 'b'], dtype=object) else: - expected_labels = np.array([0, 1, na_sentinel, 0]) + expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) expected_uniques = np.array(['b', 'a'], dtype=object) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_numpy_array_equal(uniques, expected_uniques) From ba944eb09a784605db9a3d51aa154f1f29aa7a5f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Mar 2019 18:49:51 +0100 Subject: [PATCH 03/10] Attempt to include it in safe_sort --- pandas/core/algorithms.py | 19 +++------------- pandas/core/sorting.py | 46 +++++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5ed2e3efe26a1..5f9640308f289 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -619,22 +619,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - if na_sentinel == -1: - # GH-25409 take_1d only works for na_sentinels of -1 - try: - order = uniques.argsort() - order2 = order.argsort() - labels = take_1d(order2, labels, fill_value=na_sentinel) - uniques = uniques.take(order) - except TypeError: - # Mixed types, where uniques.argsort fails. - uniques, labels = safe_sort(uniques, labels, - na_sentinel=na_sentinel, - assume_unique=True) - else: - uniques, labels = safe_sort(uniques, labels, - na_sentinel=na_sentinel, - assume_unique=True) + uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, + assume_unique=True, + check_outofbounds=False) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 0b5b017bec9ac..5ffb1edf68f3e 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, is_list_like) from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.generic import ABCExtensionArray import pandas.core.algorithms as algorithms @@ -404,7 +405,8 @@ def _reorder_by_uniques(uniques, labels): return uniques, labels -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, + check_outofbounds=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. @@ -425,6 +427,10 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. + check_outofbounds : bool, default True + Check if labels are out of bound for the values and put out of bound + labels equal to na_sentinel. If ``check_outofbounds=False``, it is + assumed there are no out of bound labels. Returns ------- @@ -446,8 +452,8 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") - if not isinstance(values, np.ndarray): - + if (not isinstance(values, np.ndarray) + and not isinstance(values, ABCExtensionArray)): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) @@ -461,7 +467,8 @@ def sort_mixed(values): return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None - if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer': + if (PY3 and not isinstance(values, ABCExtensionArray) + and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: @@ -494,15 +501,26 @@ def sort_mixed(values): t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = (labels < -len(values)) | (labels >= len(values)) | \ - (labels == na_sentinel) - - # (Out of bound indices will be masked with `na_sentinel` next, so we may - # deal with them here without performance loss using `mode='wrap'`.) - new_labels = reverse_indexer.take(labels, mode='wrap') - np.putmask(new_labels, mask, na_sentinel) + if na_sentinel == -1: + # take_1d is faster, but only works for na_sentinels of -1 + order2 = sorter.argsort() + new_labels = algorithms.take_1d(order2, labels, fill_value=-1) + if check_outofbounds: + mask = (labels < -len(values)) | (labels >= len(values)) + else: + mask = None + else: + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + # Out of bound indices will be masked with `na_sentinel` next, so we + # may deal with them here without performance loss using `mode='wrap'` + new_labels = reverse_indexer.take(labels, mode='wrap') + + mask = labels == na_sentinel + if check_outofbounds: + mask = mask | (labels < -len(values)) | (labels >= len(values)) + + if mask is not None: + np.putmask(new_labels, mask, na_sentinel) return ordered, ensure_platform_int(new_labels) From fdf330aab9740209c76b33e694e59801529ea197 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Apr 2019 09:19:40 +0200 Subject: [PATCH 04/10] feedback Jeff --- pandas/core/algorithms.py | 3 +-- pandas/core/sorting.py | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fe83d377721f0..fc74742ace3e0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -618,8 +618,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, - assume_unique=True, - check_outofbounds=False) + assume_unique=True, verify=False) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index af6bfa22b3489..85b2eb3bafde2 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -9,9 +9,9 @@ from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_categorical_dtype, is_list_like) + ensure_int64, ensure_platform_int, is_categorical_dtype, + is_extension_array_dtype, is_list_like) from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ABCExtensionArray import pandas.core.algorithms as algorithms @@ -406,7 +406,7 @@ def _reorder_by_uniques(uniques, labels): def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, - check_outofbounds=True): + verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. @@ -427,10 +427,12 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. - check_outofbounds : bool, default True + verify : bool, default True Check if labels are out of bound for the values and put out of bound - labels equal to na_sentinel. If ``check_outofbounds=False``, it is - assumed there are no out of bound labels. + labels equal to na_sentinel. If ``verify=False``, it is assumed there + are no out of bound labels. + + .. versionadded:: 0.25.0 Returns ------- @@ -453,7 +455,7 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, "safe_sort as values") if (not isinstance(values, np.ndarray) - and not isinstance(values, ABCExtensionArray)): + and not is_extension_array_dtype(values)): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) @@ -467,7 +469,7 @@ def sort_mixed(values): return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None - if (PY3 and not isinstance(values, ABCExtensionArray) + if (not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) @@ -505,7 +507,7 @@ def sort_mixed(values): # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() new_labels = algorithms.take_1d(order2, labels, fill_value=-1) - if check_outofbounds: + if verify: mask = (labels < -len(values)) | (labels >= len(values)) else: mask = None @@ -517,7 +519,7 @@ def sort_mixed(values): new_labels = reverse_indexer.take(labels, mode='wrap') mask = labels == na_sentinel - if check_outofbounds: + if verify: mask = mask | (labels < -len(values)) | (labels >= len(values)) if mask is not None: From b08ea6d4761839e6ae52c29db1458c1d61115880 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Apr 2019 09:52:34 +0200 Subject: [PATCH 05/10] add tests for safe_sort --- pandas/core/sorting.py | 2 +- pandas/tests/test_sorting.py | 43 +++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 85b2eb3bafde2..3e016da8260b5 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -430,7 +430,7 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify : bool, default True Check if labels are out of bound for the values and put out of bound labels equal to na_sentinel. If ``verify=False``, it is assumed there - are no out of bound labels. + are no out of bound labels. Ignored when ``labels`` is None. .. versionadded:: 0.25.0 diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 04a50cf6facd5..86c3734c55db2 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -7,7 +7,7 @@ import pytest from pandas import ( - DataFrame, MultiIndex, Series, compat, concat, merge, to_datetime) + DataFrame, MultiIndex, Series, array, compat, concat, merge, to_datetime) from pandas.core import common as com from pandas.core.sorting import ( decons_group_index, get_group_index, is_int64_overflow_possible, @@ -359,34 +359,39 @@ def test_basic_sort(self): expected = np.array([]) tm.assert_numpy_array_equal(result, expected) - def test_labels(self): + @pytest.mark.parametrize('verify', [True, False]) + def test_labels(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) labels = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_labels = safe_sort(values, labels) + result, result_labels = safe_sort(values, labels, verify=verify) expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) # na_sentinel labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, - na_sentinel=99) + result, result_labels = safe_sort(values, labels, na_sentinel=99, + verify=verify) expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) - # out of bound indices - labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels) - expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) + labels = [] + result, result_labels = safe_sort(values, labels, verify=verify) + expected_labels = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) - labels = [] + def test_labels_out_of_bound(self): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + # out of bound indices + labels = [0, 101, 102, 2, 3, 0, 99, 4] result, result_labels = safe_sort(values, labels) - expected_labels = np.array([], dtype=np.intp) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) @@ -431,3 +436,19 @@ def test_exceptions(self): with pytest.raises(ValueError, match="values should be unique"): safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + + @pytest.mark.parametrize('verify', [True, False]) + def test_extension_array(self, verify): + # a = array([1, 3, np.nan, 2], dtype='Int64') + a = array([1, 3, 2], dtype='Int64') + result = safe_sort(a) + # expected = array([1, 2, 3, np.nan], dtype='Int64') + expected = array([1, 2, 3], dtype='Int64') + tm.assert_extension_array_equal(result, expected) + + a = array([1, 3, 2], dtype='Int64') + result, labels = safe_sort(a, [0, 1, 2], verify=verify) + expected_values = array([1, 2, 3], dtype='Int64') + expected_labels = np.array([0, 2, 1], dtype=np.intp) + tm.assert_extension_array_equal(result, expected_values) + tm.assert_numpy_array_equal(labels, expected_labels) From 9de26fc83c39b4fc8863a9a827eedf7b1a117ea0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Apr 2019 09:58:02 +0200 Subject: [PATCH 06/10] additional test for other na_sentinel in case of out of bound indices --- pandas/tests/test_sorting.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 86c3734c55db2..37d2da9b45508 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -384,14 +384,18 @@ def test_labels(self, verify): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) - def test_labels_out_of_bound(self): + @pytest.mark.parametrize('na_sentinel', [-1, 99]) + def test_labels_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # out of bound indices labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels) - expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) + result, result_labels = safe_sort( + values, labels, na_sentinel=na_sentinel) + expected_labels = np.array( + [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], + dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) From bcb8c7e37f7b7b8bba8e3f504696d9c7638fcbc0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Apr 2019 10:06:00 +0200 Subject: [PATCH 07/10] additional test for EA with custom na_sentinel --- pandas/tests/test_sorting.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 37d2da9b45508..b0dd2d5a1c27e 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -441,8 +441,7 @@ def test_exceptions(self): match="values should be unique"): safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) - @pytest.mark.parametrize('verify', [True, False]) - def test_extension_array(self, verify): + def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') a = array([1, 3, 2], dtype='Int64') result = safe_sort(a) @@ -450,9 +449,13 @@ def test_extension_array(self, verify): expected = array([1, 2, 3], dtype='Int64') tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize('verify', [True, False]) + @pytest.mark.parametrize('na_sentinel', [-1, 99]) + def test_extension_array_labels(self, verify, na_sentinel): a = array([1, 3, 2], dtype='Int64') - result, labels = safe_sort(a, [0, 1, 2], verify=verify) + result, labels = safe_sort(a, [0, 1, na_sentinel, 2], + na_sentinel=na_sentinel, verify=verify) expected_values = array([1, 2, 3], dtype='Int64') - expected_labels = np.array([0, 2, 1], dtype=np.intp) + expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) tm.assert_numpy_array_equal(labels, expected_labels) From 13f6706e83760ae3f1f04fe8dce95b9b17bc7fec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Apr 2019 10:18:09 +0200 Subject: [PATCH 08/10] update factorize test for EAs with custom na_sentinel (which now works) + add whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/test_algos.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d2897afa762b1..aecabbb3d10ae 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -408,7 +408,7 @@ Other ^^^^^ - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`) -- +- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). - diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b64786de264cd..4bc83bbd09ff1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -24,6 +24,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com +from pandas.core.sorting import safe_sort import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -327,18 +328,26 @@ def test_parametrized_factorize_na_value(self, data, na_value): @pytest.mark.parametrize('sort', [True, False]) @pytest.mark.parametrize('na_sentinel', [-1, -10, 100]) - def test_factorize_na_sentinel(self, sort, na_sentinel): - data = np.array(['b', 'a', None, 'b'], dtype=object) + @pytest.mark.parametrize('data, uniques', [ + (np.array(['b', 'a', None, 'b'], dtype=object), + np.array(['b', 'a'], dtype=object)), + (pd.array([2, 1, np.nan, 2], dtype='Int64'), + pd.array([2, 1], dtype='Int64'))], + ids=['numpy_array', 'extension_array']) + def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) - expected_uniques = np.array(['a', 'b'], dtype=object) + expected_uniques = safe_sort(uniques) else: expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) - expected_uniques = np.array(['b', 'a'], dtype=object) + expected_uniques = uniques tm.assert_numpy_array_equal(labels, expected_labels) - tm.assert_numpy_array_equal(uniques, expected_uniques) + if isinstance(data, np.ndarray): + tm.assert_numpy_array_equal(uniques, expected_uniques) + else: + tm.assert_extension_array_equal(uniques, expected_uniques) class TestUnique(object): From e350641dbb6cced71507109aabc4d57fd5fa3969 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 May 2019 20:13:08 +0200 Subject: [PATCH 09/10] linting --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 79b164406a218..21c0c8f747b10 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -404,7 +404,7 @@ def _reorder_by_uniques(uniques, labels): return uniques, labels -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. From 151aa6ae5d3584042f21b9a3438d8f2ce2453c13 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 May 2019 20:52:12 +0200 Subject: [PATCH 10/10] more linting --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d85cf07e36298..2784b9299e447 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -428,7 +428,7 @@ Other ^^^^^ - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) -- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). +- Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). .. _whatsnew_0.250.contributors: