From 0ec36001a4284c21b7830ade629ce4544f426f2c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Feb 2018 15:12:55 -0600 Subject: [PATCH 01/47] ENH: Sorting of ExtensionArrays This enables {Series,DataFrame}.sort_values and {Series,DataFrame}.argsort --- pandas/core/arrays/base.py | 21 +++++++++ pandas/tests/extension/base/methods.py | 40 +++++++++++++++++ .../extension/category/test_categorical.py | 12 ++++++ pandas/tests/extension/conftest.py | 20 +++++++++ .../tests/extension/decimal/test_decimal.py | 43 +++++++++++++++---- pandas/tests/extension/json/test_json.py | 30 +++++++++++++ 6 files changed, 158 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cec881394a021..8a49b673c4145 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -216,6 +216,27 @@ def isna(self): """ raise AbstractMethodError(self) + def argsort(self, axis=-1, kind='quicksort', order=None): + """Returns the indices that would sort this array. + + Parameters + ---------- + axis : int or None, optional + Axis along which to sort. ExtensionArrays are 1-dimensional, + so this is only included for compatibility with NumPy. + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + Sorting algorithm. + order : str or list of str, optional + Included for NumPy compatibility. + + Returns + ------- + index_array : ndarray + Array of indices that sort ``self``. + + """ + return np.array(self).argsort(kind=kind) + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 74e5d180b1aa3..5f4604f8f17af 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -31,3 +31,43 @@ def test_count(self, data_missing): def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) + + def test_argsort(self, data_for_sorting): + result = pd.Series(data_for_sorting).argsort() + expected = pd.Series(np.array([2, 0, 1])) + self.assert_series_equal(result, expected) + + def test_argsort_missing(self, data_missing_for_sorting): + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([1, -1, 0])) + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values(self, data_for_sorting, ascending): + ser = pd.Series(data_for_sorting) + result = ser.sort_values(ascending=ascending) + expected = ser.iloc[[2, 0, 1]] + if not ascending: + expected = expected[::-1] + + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values_missing(self, data_missing_for_sorting, ascending): + ser = pd.Series(data_missing_for_sorting) + result = ser.sort_values(ascending=ascending) + if ascending: + expected = ser.iloc[[2, 0, 1]] + else: + expected = ser.iloc[[0, 2, 1]] + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending): + df = pd.DataFrame({"A": [1, 2, 1], + "B": data_for_sorting}) + result = df.sort_values(['A', 'B']) + expected = pd.DataFrame({"A": [1, 1, 2], + 'B': data_for_sorting.take([2, 0, 1])}, + index=[2, 0, 1]) + self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 8f413b4a19730..2400e7654e294 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -29,6 +29,18 @@ def data_missing(): return Categorical([np.nan, 'A']) +@pytest.fixture +def data_for_sorting(): + return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'], + ordered=True) + + +@pytest.fixture +def data_missing_for_sorting(): + return Categorical(['A', None, 'B'], categories=['B', 'A'], + ordered=True) + + @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 21ed8894e8ebb..04dfb408fc378 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -30,6 +30,26 @@ def all_data(request, data, data_missing): return data_missing +@pytest.fixture +def data_for_sorting(): + """Length-3 array with a known sort order. + + This should be three items [B, C, A] with + A < B < C + """ + raise NotImplementedError + + +@pytest.fixture +def data_missing_for_sorting(): + """Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + raise NotImplementedError + + @pytest.fixture def na_cmp(): """Binary operator for comparing NA values. diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7b4d079ecad87..ca496a014651c 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -25,6 +25,20 @@ def data_missing(): return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) +@pytest.fixture +def data_for_sorting(): + return DecimalArray([decimal.Decimal('1'), + decimal.Decimal('2'), + decimal.Decimal('0')]) + + +@pytest.fixture +def data_missing_for_sorting(): + return DecimalArray([decimal.Decimal('1'), + decimal.Decimal('NaN'), + decimal.Decimal('0')]) + + @pytest.fixture def na_cmp(): return lambda x, y: x.is_nan() and y.is_nan() @@ -35,19 +49,32 @@ def na_value(): return decimal.Decimal("NaN") -class TestDtype(base.BaseDtypeTests): +class BaseDecimal(object): + @staticmethod + def assert_series_equal(left, right, *args, **kwargs): + + left_na = left.isna() + right_na = right.isna() + + tm.assert_series_equal(left_na, right_na) + return tm.assert_series_equal(left[~left_na], + right[~right_na], + *args, **kwargs) + + +class TestDtype(BaseDecimal, base.BaseDtypeTests): pass -class TestInterface(base.BaseInterfaceTests): +class TestInterface(BaseDecimal, base.BaseInterfaceTests): pass -class TestConstructors(base.BaseConstructorsTests): +class TestConstructors(BaseDecimal, base.BaseConstructorsTests): pass -class TestReshaping(base.BaseReshapingTests): +class TestReshaping(BaseDecimal, base.BaseReshapingTests): def test_align(self, data, na_value): # Have to override since assert_series_equal doesn't @@ -88,15 +115,15 @@ def test_align_frame(self, data, na_value): assert e2.loc[0, 'A'].is_nan() -class TestGetitem(base.BaseGetitemTests): +class TestGetitem(BaseDecimal, base.BaseGetitemTests): pass -class TestMissing(base.BaseMissingTests): +class TestMissing(BaseDecimal, base.BaseMissingTests): pass -class TestMethods(base.BaseMethodsTests): +class TestMethods(BaseDecimal, base.BaseMethodsTests): @pytest.mark.parametrize('dropna', [True, False]) @pytest.mark.xfail(reason="value_counts not implemented yet.") def test_value_counts(self, all_data, dropna): @@ -112,7 +139,7 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) -class TestCasting(base.BaseCastingTests): +class TestCasting(BaseDecimal, base.BaseCastingTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index e0721bb1d8d1a..8356adf42c6d7 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -29,6 +29,16 @@ def data_missing(): return JSONArray([{}, {'a': 10}]) +@pytest.fixture +def data_for_sorting(): + return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}]) + + +@pytest.fixture +def data_missing_for_sorting(): + return JSONArray([{'b': 1}, {}, {'c': 4}]) + + @pytest.fixture def na_value(): return {} @@ -68,6 +78,26 @@ class TestMethods(base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): pass + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_argsort(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_argsort_missing(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_sort_values(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_sort_values_missing(self): + pass + + @pytest.mark.skip(reason="Dictionaries are not orderable.") + def test_sort_values_frame(self): + pass + class TestCasting(base.BaseCastingTests): pass From 47072735cc68a1538fb1ef953dd359056e5c5af9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 06:29:01 -0600 Subject: [PATCH 02/47] REF: Split argsort into two parts --- pandas/core/arrays/base.py | 29 ++++++++++++++++++++++++++--- pandas/core/arrays/categorical.py | 11 ++++++----- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8a49b673c4145..01ebf9b95d71a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2,6 +2,7 @@ import numpy as np from pandas.errors import AbstractMethodError +from pandas.compat.numpy import function as nv _not_implemented_message = "{} does not implement {}." @@ -216,11 +217,24 @@ def isna(self): """ raise AbstractMethodError(self) - def argsort(self, axis=-1, kind='quicksort', order=None): + def _values_for_argsort(self): + # type: () -> ndarray + """Get the ndarray to be passed to np.argsort. + + This is called from within 'ExtensionArray.argsort'. + + Returns + ------- + values : ndarray + """ + return np.array(self) + + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """Returns the indices that would sort this array. Parameters ---------- + ascending : bool, default True axis : int or None, optional Axis along which to sort. ExtensionArrays are 1-dimensional, so this is only included for compatibility with NumPy. @@ -233,9 +247,18 @@ def argsort(self, axis=-1, kind='quicksort', order=None): ------- index_array : ndarray Array of indices that sort ``self``. - """ - return np.array(self).argsort(kind=kind) + # Implementor note: You have two places to override the behavior of + # argsort. + # 1. _values_for_argsort : construct the values passed to np.argsort + # 2. argsort : total control over sorting. + + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + values = self._values_for_argsort() + result = np.argsort(values, kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result # ------------------------------------------------------------------------ # Indexing methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6eeabf0148d0..d2a357fa22d55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1386,6 +1386,9 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) + def _values_for_argsort(self): + return self._codes.copy() + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1406,11 +1409,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.ndarray.argsort """ - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = np.argsort(self._codes.copy(), kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result + # Keep the implementation here just for the docstring. + return super(Categorical, self).argsort(ascending=ascending, kind=kind, + *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From b61fb8d5971d1accc2f306ae9f2cef986e885f20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 10:03:57 -0600 Subject: [PATCH 03/47] Fixed docstring --- pandas/core/arrays/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 01ebf9b95d71a..479e821586067 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -235,18 +235,21 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): Parameters ---------- ascending : bool, default True - axis : int or None, optional - Axis along which to sort. ExtensionArrays are 1-dimensional, - so this is only included for compatibility with NumPy. + Whether the indices should result in an ascending + or descending sort. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. - order : str or list of str, optional - Included for NumPy compatibility. + args, kwargs: + passed through to :func:`numpy.argsort`. Returns ------- index_array : ndarray Array of indices that sort ``self``. + + See Also + -------- + numpy.argsort """ # Implementor note: You have two places to override the behavior of # argsort. From 44b6d72be5b2431bbe38454a7fc7439244575925 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 13:23:18 -0600 Subject: [PATCH 04/47] Remove _values_for_argsort --- pandas/core/arrays/base.py | 21 ++------------------- pandas/core/arrays/categorical.py | 11 +++++------ 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 479e821586067..cfc9d54c31c55 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -217,20 +217,8 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_argsort(self): - # type: () -> ndarray - """Get the ndarray to be passed to np.argsort. - - This is called from within 'ExtensionArray.argsort'. - - Returns - ------- - values : ndarray - """ - return np.array(self) - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """Returns the indices that would sort this array. + """Return the indices that would sort this array. Parameters ---------- @@ -251,13 +239,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.argsort """ - # Implementor note: You have two places to override the behavior of - # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort - # 2. argsort : total control over sorting. - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() + values = self.astype(object) result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d2a357fa22d55..c6eeabf0148d0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1386,9 +1386,6 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _values_for_argsort(self): - return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1409,9 +1406,11 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.ndarray.argsort """ - # Keep the implementation here just for the docstring. - return super(Categorical, self).argsort(ascending=ascending, kind=kind, - *args, **kwargs) + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + result = np.argsort(self._codes.copy(), kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From 5be3917db210ff5f8c644e11411597d190aa4270 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 15:12:15 -0600 Subject: [PATCH 05/47] Revert "Remove _values_for_argsort" This reverts commit 44b6d72be5b2431bbe38454a7fc7439244575925. --- pandas/core/arrays/base.py | 21 +++++++++++++++++++-- pandas/core/arrays/categorical.py | 11 ++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cfc9d54c31c55..479e821586067 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -217,8 +217,20 @@ def isna(self): """ raise AbstractMethodError(self) + def _values_for_argsort(self): + # type: () -> ndarray + """Get the ndarray to be passed to np.argsort. + + This is called from within 'ExtensionArray.argsort'. + + Returns + ------- + values : ndarray + """ + return np.array(self) + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """Return the indices that would sort this array. + """Returns the indices that would sort this array. Parameters ---------- @@ -239,8 +251,13 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.argsort """ + # Implementor note: You have two places to override the behavior of + # argsort. + # 1. _values_for_argsort : construct the values passed to np.argsort + # 2. argsort : total control over sorting. + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self.astype(object) + values = self._values_for_argsort() result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6eeabf0148d0..d2a357fa22d55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1386,6 +1386,9 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) + def _values_for_argsort(self): + return self._codes.copy() + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1406,11 +1409,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): -------- numpy.ndarray.argsort """ - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = np.argsort(self._codes.copy(), kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result + # Keep the implementation here just for the docstring. + return super(Categorical, self).argsort(ascending=ascending, kind=kind, + *args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From c2578c3942d37ea88dbddf8324a9e75c1f4fe637 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 16:28:19 -0600 Subject: [PATCH 06/47] Workaround Py2 --- pandas/core/arrays/categorical.py | 46 +++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e98dbb750884a..dcb6945a196fa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1393,17 +1393,21 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """ - Returns the indices that would sort the Categorical instance if - 'sort_values' was called. This function is implemented to provide - compatibility with numpy ndarray objects. + def argsort(self, *args, **kwargs): + # TODO(PY2): use correct signature + # We have to do *args, **kwargs to avoid a a py2-only signature + # issue since np.argsort differs from argsort. + """Return the indicies that would sort the Categorical. - While an ordering is applied to the category values, arg-sorting - in this context refers more to organizing and grouping together - based on matching category values. Thus, this function can be - called on an unordered Categorical instance unlike the functions - 'Categorical.min' and 'Categorical.max'. + Parameters + ---------- + ascending : bool, default True + Whether the indices should result in an ascending + or descending sort. + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + Sorting algorithm. + args, kwargs: + passed through to :func:`numpy.argsort`. Returns ------- @@ -1412,10 +1416,28 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): See also -------- numpy.ndarray.argsort + + Notes + ----- + While an ordering is applied to the category values, arg-sorting + in this context refers more to organizing and grouping together + based on matching category values. Thus, this function can be + called on an unordered Categorical instance unlike the functions + 'Categorical.min' and 'Categorical.max'. + + Examples + -------- + >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() + array([2, 0, 1, 3]) + + >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], + ... categories=['c', 'b', 'a'], + ... ordered=True) + >>> cat.argsort() + array([3, 0, 1, 2]) """ # Keep the implementation here just for the docstring. - return super(Categorical, self).argsort(ascending=ascending, kind=kind, - *args, **kwargs) + return super(Categorical, self).argsort(*args, **kwargs) def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new From b73e303b247f19c3fa7849fa9a37e44e63e7a64f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Mar 2018 16:31:01 -0600 Subject: [PATCH 07/47] Indexer as array --- pandas/tests/extension/decimal/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 736556e4be20d..f1852542088ff 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -71,6 +71,7 @@ def isna(self): return np.array([x.is_nan() for x in self.values]) def take(self, indexer, allow_fill=True, fill_value=None): + indexer = np.asarray(indexer) mask = indexer == -1 indexer = _ensure_platform_int(indexer) From 0db9e97a3f60114f757b0cb9a849749f26308fbb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 Mar 2018 06:58:01 -0600 Subject: [PATCH 08/47] Fixed dtypes --- pandas/tests/extension/base/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 5f4604f8f17af..d3348549db0bd 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -34,12 +34,12 @@ def test_apply_simple_series(self, data): def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() - expected = pd.Series(np.array([2, 0, 1])) + expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) self.assert_series_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0])) + expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) @pytest.mark.parametrize('ascending', [True, False]) From baf624c1148393bcb42eda4c6ad3c1270963258d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Mar 2018 09:05:41 -0500 Subject: [PATCH 09/47] Fixed docstring --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index dcb6945a196fa..4b1238f4fbf05 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1406,7 +1406,7 @@ def argsort(self, *args, **kwargs): or descending sort. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. - args, kwargs: + *args, **kwargs: passed through to :func:`numpy.argsort`. Returns From 7bbe796f45331444d7c3f038e2bcbf16b9f6247f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Mar 2018 21:14:30 -0500 Subject: [PATCH 10/47] Update docs --- pandas/core/arrays/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7f78ca6e29da2..322f71ad83580 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -250,7 +250,8 @@ def _values_for_argsort(self): return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """Returns the indices that would sort this array. + """ + Return the indices that would sort this array. Parameters ---------- @@ -259,7 +260,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): or descending sort. kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. - args, kwargs: + *args, **kwargs: passed through to :func:`numpy.argsort`. Returns @@ -269,7 +270,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): See Also -------- - numpy.argsort + numpy.argsort : Sorting implementation used internally. """ # Implementor note: You have two places to override the behavior of # argsort. From 31ed4c9f6d571c8989d8ca19cb2afa084d55eb11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Mar 2018 11:26:33 -0500 Subject: [PATCH 11/47] ENH/API: ExtensionArray.factorize Adds factorize to the interface for ExtensionArray, with a default implementation. This is a stepping stone to groupby. --- pandas/core/algorithms.py | 7 ++-- pandas/core/arrays/base.py | 35 +++++++++++++++++++ pandas/tests/extension/base/base.py | 3 ++ pandas/tests/extension/base/methods.py | 20 +++++++++++ .../extension/category/test_categorical.py | 5 +++ pandas/tests/extension/conftest.py | 11 ++++++ .../tests/extension/decimal/test_decimal.py | 9 +++++ pandas/tests/extension/json/array.py | 16 +++++++++ pandas/tests/extension/json/test_json.py | 17 +++++++-- pandas/util/testing.py | 27 ++++++++++++++ 10 files changed, 144 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index de2e638265f1e..adb09f748cbaa 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -146,10 +146,9 @@ def _reconstruct_data(values, dtype, original): Returns ------- Index for extension types, otherwise ndarray casted to dtype - """ from pandas import Index - if is_categorical_dtype(dtype): + if is_extension_array_dtype(dtype): pass elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): values = Index(original)._shallow_copy(values, name=None) @@ -502,9 +501,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): values = _ensure_arraylike(values) original = values - if is_categorical_dtype(values): + if is_extension_array_dtype(values): values = getattr(values, '_values', values) - labels, uniques = values.factorize() + labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1f33081a5f610..f65352d815eb2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -248,6 +248,41 @@ def unique(self): uniques = unique(self.astype(object)) return self._constructor_from_sequence(uniques) + def factorize(self, na_sentinel=-1): + """Encode the extension array as an enumerated type. + + Parameters + ---------- + na_sentinel : int, default -1 + Value to use in the `labels` array to indicate missing values. + + Returns + ------- + labels : ndarray + An interger NumPy array that's an indexer into the original + ExtensionArray + uniques : ExtensionArray + An ExtensionArray containing the unique values of `self`. + + See Also + -------- + pandas.factorize : top-level factorize method that dispatches here. + + Notes + ----- + :meth:`pandas.factorize` offers a `sort` keyword as well. + """ + from pandas.core.algorithms import _factorize_array + + mask = self.isna() + arr = self.astype(object) + arr[mask] = np.nan + + labels, uniques = _factorize_array(arr, check_nulls=True, + na_sentinel=na_sentinel) + uniques = self._constructor_from_sequence(uniques) + return labels, uniques + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index d29587e635ebd..beb7948f2c14b 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -4,3 +4,6 @@ class BaseExtensionTests(object): assert_series_equal = staticmethod(tm.assert_series_equal) assert_frame_equal = staticmethod(tm.assert_frame_equal) + assert_extension_array_equal = staticmethod( + tm.assert_extension_array_equal + ) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 7ce80e25d8cf6..bb683f41b34c6 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pandas.util.testing as tm from .base import BaseExtensionTests @@ -42,3 +43,22 @@ def test_unique(self, data, box, method): assert len(result) == 1 assert isinstance(result, type(data)) assert result[0] == duplicated[0] + + @pytest.mark.parametrize('na_sentinel', [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + labels, uniques = pd.factorize(data_for_grouping, + na_sentinel=na_sentinel) + expected_labels = np.array([0, 0, na_sentinel, + na_sentinel, 1, 1, 0, 2], + dtype='int64') + expected_uniques = data_for_grouping.take([0, 4, 7]) + + tm.assert_numpy_array_equal(labels, expected_labels) + self.assert_extension_array_equal(uniques, expected_uniques) + + def test_factorize_equivalence(self, data_for_grouping): + l1, u1 = pd.factorize(data_for_grouping) + l2, u2 = pd.factorize(data_for_grouping) + + tm.assert_numpy_array_equal(l1, l2) + self.assert_extension_array_equal(u1, u2) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 8f413b4a19730..343e29523b6fa 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -34,6 +34,11 @@ def na_value(): return np.nan +@pytest.fixture +def data_for_grouping(): + return Categorical(['a', 'a', None, None, 'b', 'b', 'a', 'c']) + + class TestDtype(base.BaseDtypeTests): pass diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 21ed8894e8ebb..9904fd7118d27 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -46,3 +46,14 @@ def na_cmp(): def na_value(): """The scalar missing value for this type. Default 'None'""" return None + + +@pytest.fixture +def data_for_grouping(): + """Data for factorization, grouping, and unique tests. + + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + raise NotImplementedError diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7b4d079ecad87..7026f33674f94 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -35,6 +35,15 @@ def na_value(): return decimal.Decimal("NaN") +@pytest.fixture +def data_for_grouping(): + b = decimal.Decimal('1.0') + a = decimal.Decimal('0.0') + c = decimal.Decimal('2.0') + na = decimal.Decimal('NaN') + return DecimalArray([b, b, na, na, a, a, b, c]) + + class TestDtype(base.BaseDtypeTests): pass diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 322944129146a..d885159e1ed4a 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -7,6 +7,7 @@ import numpy as np +import pandas as pd from pandas.core.dtypes.base import ExtensionDtype from pandas.core.arrays import ExtensionArray @@ -104,6 +105,21 @@ def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) return cls(data) + def factorize(self, na_sentinel=-1): + frozen = tuple(tuple(x.items()) for x in self) + labels, uniques = pd.factorize(frozen) + + # fixup NA + if self.isna().any(): + na_code = self.isna().argmax() + + labels[labels == na_code] = na_sentinel + labels[labels > na_code] -= 1 + + uniques = JSONArray([collections.UserDict(x) + for x in uniques if x != ()]) + return labels, uniques + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index e0721bb1d8d1a..1eecba2ae16b5 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -39,6 +39,17 @@ def na_cmp(): return operator.eq +@pytest.fixture +def data_for_grouping(): + return JSONArray([ + {'b': 1}, {'b': 1}, + {}, {}, + {'a': 0, 'c': 2}, {'a': 0, 'c': 2}, + {'b': 1}, + {'c': 2}, + ]) + + class TestDtype(base.BaseDtypeTests): pass @@ -64,8 +75,10 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="Unhashable") - def test_value_counts(self, all_data, dropna): + unhashable = pytest.mark.skip(reason="Unhashable") + + @unhashable + def test_factorize(self): pass diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a223e4d8fd23e..ff277c0cea10a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -20,6 +20,7 @@ import numpy as np import pandas as pd +from pandas.core.arrays.base import ExtensionArray from pandas.core.dtypes.missing import array_equivalent from pandas.core.dtypes.common import ( is_datetimelike_v_numeric, @@ -1083,6 +1084,32 @@ def _raise(left, right, err_msg): return True +def assert_extension_array_equal(left, right): + """Check that left and right ExtensionArrays are equal. + + Parameters + ---------- + left, right : ExtensionArray + The two arrays to compare + + Notes + ----- + Missing values are checked separately from valid values. + A mask of missing values is computed for each and checked to match. + The remaining all-valid values are cast to object dtype and checked. + """ + assert isinstance(left, ExtensionArray) + assert left.dtype == right.dtype + left_na = left.isna() + right_na = right.isna() + assert_numpy_array_equal(left_na, right_na) + + left_valid = left[~left_na].astype(object) + right_valid = right[~right_na].astype(object) + + assert_numpy_array_equal(left_valid, right_valid) + + # This could be refactored to use the NDFrame.equals method def assert_series_equal(left, right, check_dtype=True, check_index_type='equiv', From 434df7dc9c927ac52b881e748ebee12c1389cf15 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 07:20:27 -0500 Subject: [PATCH 12/47] fixup! ENH/API: ExtensionArray.factorize --- pandas/tests/extension/json/array.py | 2 +- pandas/tests/extension/json/test_json.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d885159e1ed4a..fcfde3911b808 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -111,7 +111,7 @@ def factorize(self, na_sentinel=-1): # fixup NA if self.isna().any(): - na_code = self.isna().argmax() + na_code = labels[self.isna()][0] labels[labels == na_code] = na_sentinel labels[labels > na_code] -= 1 diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 1eecba2ae16b5..16e78cd9ba8ef 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -75,10 +75,8 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - unhashable = pytest.mark.skip(reason="Unhashable") - - @unhashable - def test_factorize(self): + @pytest.mark.skip(reason="Unhashable") + def test_value_counts(self): pass From 505ad449f9b34c8e662739b9ea80b3aaca4ebb39 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 08:47:25 -0500 Subject: [PATCH 13/47] REF: Changed ExtensionDtype inheritance `is_extension_array_dtype(dtype)` was incorrect for dtypes that haven't implemented the new interface yet. This is because they indirectly subclassed ExtensionDtype. This PR changes the hierarchy so that PandasExtensionDtype doesn't subclass ExtensionDtype. As we implement the interface, like Categorical, we'll add ExtensionDtype as a base class. Before: ``` DatetimeTZDtype <- PandasExtensionDtype <- ExtensionDtype (wrong) CategoricalDtype <- PandasExtensionDtype <- ExtensionDtype (right) After: DatetimeTZDtype <- PandasExtensionDtype \ - _DtypeOpsMixin / ExtensionDtype ------ CategoricalDtype - PandasExtensionDtype - \ \ \ -_DtypeOpsMixin \ / ExtensionDtype ------- ``` Once all our extension dtypes have implemented the interface we can go back to the simple, linear inheritance structure. --- pandas/core/dtypes/base.py | 144 +++++++++++++++++++---------------- pandas/core/dtypes/dtypes.py | 6 +- 2 files changed, 81 insertions(+), 69 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d54d980d02ffa..6dbed5f138d5d 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -5,26 +5,16 @@ from pandas.errors import AbstractMethodError -class ExtensionDtype(object): - """A custom data type, to be paired with an ExtensionArray. - - Notes - ----- - The interface includes the following abstract methods that must - be implemented by subclasses: - - * type - * name - * construct_from_string - - This class does not inherit from 'abc.ABCMeta' for performance reasons. - Methods and properties required by the interface raise - ``pandas.errors.AbstractMethodError`` and no ``register`` method is - provided for registering virtual subclasses. - """ - - def __str__(self): - return self.name +class _DtypeOpsMixin(object): + # Not all of pandas' extension dtypes are compatibile with + # the new ExtensionArray interface. This means PandasExtensionDtype + # can't subclass ExtensionDtype yet, as is_extension_array_dtype would + # incorrectly say that these types are extension types. + # + # In the interim, we put methods that are shared between the two base + # classes ExtensionDtype and PandasExtensionDtype here. Both those base + # classes will inherit from this Mixin. Once everything is compatible, this + # class's methods can be moved to ExtensionDtype and removed. def __eq__(self, other): """Check whether 'other' is equal to self. @@ -52,6 +42,74 @@ def __eq__(self, other): def __ne__(self, other): return not self.__eq__(other) + @property + def names(self): + # type: () -> Optional[List[str]] + """Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. + """ + return None + + @classmethod + def is_dtype(cls, dtype): + """Check if we match 'dtype'. + + Parameters + ---------- + dtype : object + The object to check. + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. + 2. ``dtype`` is an object and is an instance of ``cls`` + 3. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. + """ + dtype = getattr(dtype, 'dtype', dtype) + + if isinstance(dtype, np.dtype): + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + + +class ExtensionDtype(_DtypeOpsMixin): + """A custom data type, to be paired with an ExtensionArray. + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_from_string + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + """ + + def __str__(self): + return self.name + @property def type(self): # type: () -> type @@ -87,16 +145,6 @@ def name(self): """ raise AbstractMethodError(self) - @property - def names(self): - # type: () -> Optional[List[str]] - """Ordered list of field names, or None if there are no fields. - - This is for compatibility with NumPy arrays, and may be removed in the - future. - """ - return None - @classmethod def construct_from_string(cls, string): """Attempt to construct this type from a string. @@ -128,39 +176,3 @@ def construct_from_string(cls, string): ... "'{}'".format(cls, string)) """ raise AbstractMethodError(cls) - - @classmethod - def is_dtype(cls, dtype): - """Check if we match 'dtype'. - - Parameters - ---------- - dtype : object - The object to check. - - Returns - ------- - is_dtype : bool - - Notes - ----- - The default implementation is True if - - 1. ``cls.construct_from_string(dtype)`` is an instance - of ``cls``. - 2. ``dtype`` is an object and is an instance of ``cls`` - 3. ``dtype`` has a ``dtype`` attribute, and any of the above - conditions is true for ``dtype.dtype``. - """ - dtype = getattr(dtype, 'dtype', dtype) - - if isinstance(dtype, np.dtype): - return False - elif dtype is None: - return False - elif isinstance(dtype, cls): - return True - try: - return cls.construct_from_string(dtype) is not None - except TypeError: - return False diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d262a71933915..708f54f5ca75b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -5,10 +5,10 @@ from pandas import compat from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex -from .base import ExtensionDtype +from .base import ExtensionDtype, _DtypeOpsMixin -class PandasExtensionDtype(ExtensionDtype): +class PandasExtensionDtype(_DtypeOpsMixin): """ A np.dtype duck-typed class, suitable for holding a custom dtype. @@ -83,7 +83,7 @@ class CategoricalDtypeType(type): pass -class CategoricalDtype(PandasExtensionDtype): +class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness From b59656f33b2b4b324fea35b5a9ae26e3efa4b90f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 13:37:43 -0500 Subject: [PATCH 14/47] Fix factorize equivalence test --- pandas/tests/extension/base/methods.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index bb683f41b34c6..577fbbaf477c9 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -56,9 +56,10 @@ def test_factorize(self, data_for_grouping, na_sentinel): tm.assert_numpy_array_equal(labels, expected_labels) self.assert_extension_array_equal(uniques, expected_uniques) - def test_factorize_equivalence(self, data_for_grouping): - l1, u1 = pd.factorize(data_for_grouping) - l2, u2 = pd.factorize(data_for_grouping) + @pytest.mark.parametrize('na_sentinel', [-1, -2]) + def test_factorize_equivalence(self, data_for_grouping, na_sentinel): + l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel) tm.assert_numpy_array_equal(l1, l2) self.assert_extension_array_equal(u1, u2) From 201e029b34e06e0c3144b086063327b94b1504cb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 14:24:27 -0500 Subject: [PATCH 15/47] Shared factorize doc --- pandas/core/algorithms.py | 45 ++++++----- pandas/core/arrays/base.py | 9 ++- pandas/core/arrays/categorical.py | 42 +---------- pandas/core/base.py | 119 ++++++++++++++++++++++++++---- 4 files changed, 135 insertions(+), 80 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index adb09f748cbaa..00ca82bb1c49d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,8 +4,11 @@ """ from __future__ import division from warnings import warn, catch_warnings +from textwrap import dedent + import numpy as np +from pandas.core.base import _shared_docs from pandas.core.dtypes.cast import ( maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( @@ -34,7 +37,8 @@ from pandas.core import common as com from pandas._libs import algos, lib, hashtable as htable from pandas._libs.tslib import iNaT -from pandas.util._decorators import deprecate_kwarg +from pandas.util._decorators import (Appender, Substitution, + deprecate_kwarg) # --------------- # @@ -463,32 +467,25 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): return labels, uniques -@deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): - """ - Encode input values as an enumerated type or categorical variable - - Parameters - ---------- +@Substitution( + values=dedent("""\ values : Sequence - ndarrays must be 1-D. Sequences that aren't pandas objects are + A 1-D seqeunce. Sequences that aren't pandas objects are coereced to ndarrays before factorization. + """), + sort=dedent("""\ sort : boolean, default False - Sort by values - na_sentinel : int, default -1 - Value to mark "not found" - size_hint : hint to the hashtable sizer - - Returns - ------- - labels : the indexer to the original array - uniques : ndarray (1-d) or Index - the unique values. Index is returned when passed values is Index or - Series - - note: an array of Periods will ignore sort as it returns an always sorted - PeriodIndex. - """ + Sort `uniques` and shuffle `labels` to maintain the + relationship. + """), + size_hint=dedent("""\ + size_hint : int, optional + Hint to the hashtable sizer. + """), +) +@Appender(_shared_docs['factorize']) +@deprecate_kwarg(old_arg_name='order', new_arg_name=None) +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) # 2.) factorizing labels and uniques diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f65352d815eb2..1b07d217f141d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -260,10 +260,15 @@ def factorize(self, na_sentinel=-1): ------- labels : ndarray An interger NumPy array that's an indexer into the original - ExtensionArray + ExtensionArray. uniques : ExtensionArray An ExtensionArray containing the unique values of `self`. + .. note:: + + uniques should *not* contain a value for the NA sentinel, + if values in `self` are missing. + See Also -------- pandas.factorize : top-level factorize method that dispatches here. @@ -272,6 +277,8 @@ def factorize(self, na_sentinel=-1): ----- :meth:`pandas.factorize` offers a `sort` keyword as well. """ + # Implementor note: make sure to exclude missing values from your + # `uniques`. It should only contain non-NA values. from pandas.core.algorithms import _factorize_array mask = self.isna() diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6c46956a6eaf..ed73b27b4008c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2045,47 +2045,9 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) + @Substitution(values='', size_hint='', sort='') + @Appender(_shared_docs['factorize']) def factorize(self, na_sentinel=-1): - """Encode the Categorical as an enumerated type. - - Parameters - ---------- - sort : boolean, default False - Sort by values - na_sentinel: int, default -1 - Value to mark "not found" - - Returns - ------- - labels : ndarray - An integer NumPy array that's an indexer into the original - Categorical - uniques : Categorical - A Categorical whose values are the unique values and - whose dtype matches the original CategoricalDtype. Note that if - there any unobserved categories in ``self`` will not be present - in ``uniques.values``. They will be present in - ``uniques.categories`` - - Examples - -------- - >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) - >>> labels, uniques = cat.factorize() - >>> labels - (array([0, 0, 1]), - >>> uniques - [a, c] - Categories (3, object): [a, b, c]) - - Missing values are handled - - >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None])) - >>> labels - array([ 0, 1, -1]) - >>> uniques - [a, b] - Categories (2, object): [a, b] - """ from pandas.core.algorithms import _factorize_array codes = self.codes.astype('int64') diff --git a/pandas/core/base.py b/pandas/core/base.py index 257b26b64e642..56c3f96461088 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -2,6 +2,7 @@ Base and utility classes for pandas objects. """ import warnings +import textwrap from pandas import compat from pandas.compat import builtins import numpy as np @@ -1092,22 +1093,110 @@ def memory_usage(self, deep=False): v += lib.memory_usage_of_objects(self.values) return v - def factorize(self, sort=False, na_sentinel=-1): - """ - Encode the object as an enumerated type or categorical variable - - Parameters - ---------- - sort : boolean, default False - Sort by values - na_sentinel: int, default -1 - Value to mark "not found" + _shared_docs['factorize'] = """ + Encode the object as an enumerated type or categorical variable. + + This method is useful for obtaining a numeric representation of + when all that matters is identifying distinct values. `factorize` + is available as both a top-level function :func:`pandas.factorize`, + and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. + + Parameters + ---------- + %(values)s%(sort)s + na_sentinel : int, default -1 + Value to mark "not found". + %(size_hint)s\ + + Returns + ------- + labels : ndarray + An integer ndarray that's an indexer into `uniques`. + ``uniques.take(labels)`` will have the same values as `values`. + uniques : ndarray, Index, or Categorical + The unique valid values. When `values` is Categorical, `uniques` + is a Categorical. When `values` is some other pandas object, an + `Index` is returned. Otherwise, a 1-D ndarray is returned. + + .. note :: + + Even if there's a missing value in `values`, `uniques` will + *not* contain an entry for it. + + See Also + -------- + pandas.cut : Discretize continuous-valued array. + + Examples + -------- + These examples all show factorize as a top-level method like + ``pd.factorize(values)``. The results are identical for methods like + ``Series.factorize``. + + >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> labels + array([0, 0, 1, 2, 0]) + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + With ``sort=True``, the `uniques` will be sorted, and `labels` will be + shulffled so that the relationship is the maintained. + + >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> labels + array([1, 1, 0, 2, 1]) + + >>> uniques + array(['a', 'b', 'c'], dtype=object) + + Missing values are indicated by `na_sentinel` (``-1`` by default). Note + that missing values are never included in `uniques`. + + >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> labels + array([ 0, -1, 1, 2, 0]) + + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + Thus far, we've only factorized lists (which are internally coerced to + NumPy arrays). When factorizing pandas objects, the type of `uniques` + will differ. For Categoricals, a `Categorical` is returned. + + >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) + >>> labels, uniques = pd.factorize(cat) + >>> labels + array([0, 0, 1]) + + >>> uniques + [a, c] + Categories (3, object): [a, b, c] + + Notice that ``'b'`` is in ``uniques.categories``, desipite not being + present in ``cat.values``. + + For all other pandas objects, an Index of the appropriate type is + returned. + + >>> cat = pd.Series(['a', 'a', 'c']) + >>> labels, uniques = pd.factorize(cat) + >>> labels + array([0, 0, 1]) + + >>> uniques + Index(['a', 'c'], dtype='object') + """ - Returns - ------- - labels : the indexer to the original array - uniques : the unique Index - """ + @Substitution( + values='', + sort=textwrap.dedent("""\ + sort : boolean, default False + Sort `uniques` and shuffle `labels` to maintain the + relationship. + """), + size_hint='') + @Appender(_shared_docs['factorize']) + def factorize(self, sort=False, na_sentinel=-1): from pandas.core.algorithms import factorize return factorize(self, sort=sort, na_sentinel=na_sentinel) From 9b0c2a99dc70e60edbadec2778aaca64576f5857 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 14:36:39 -0500 Subject: [PATCH 16/47] Move to algorithms --- pandas/core/algorithms.py | 98 +++++++++++++++++++++++++++++- pandas/core/arrays/categorical.py | 3 +- pandas/core/base.py | 99 +------------------------------ 3 files changed, 101 insertions(+), 99 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 00ca82bb1c49d..02510b488ded7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,7 +8,6 @@ import numpy as np -from pandas.core.base import _shared_docs from pandas.core.dtypes.cast import ( maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( @@ -40,6 +39,8 @@ from pandas.util._decorators import (Appender, Substitution, deprecate_kwarg) +_shared_docs = {} + # --------------- # # dtype access # @@ -467,6 +468,101 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): return labels, uniques +_shared_docs['factorize'] = """ + Encode the object as an enumerated type or categorical variable. + + This method is useful for obtaining a numeric representation of + when all that matters is identifying distinct values. `factorize` + is available as both a top-level function :func:`pandas.factorize`, + and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. + + Parameters + ---------- + %(values)s%(sort)s + na_sentinel : int, default -1 + Value to mark "not found". + %(size_hint)s\ + + Returns + ------- + labels : ndarray + An integer ndarray that's an indexer into `uniques`. + ``uniques.take(labels)`` will have the same values as `values`. + uniques : ndarray, Index, or Categorical + The unique valid values. When `values` is Categorical, `uniques` + is a Categorical. When `values` is some other pandas object, an + `Index` is returned. Otherwise, a 1-D ndarray is returned. + + .. note :: + + Even if there's a missing value in `values`, `uniques` will + *not* contain an entry for it. + + See Also + -------- + pandas.cut : Discretize continuous-valued array. + + Examples + -------- + These examples all show factorize as a top-level method like + ``pd.factorize(values)``. The results are identical for methods like + ``Series.factorize``. + + >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> labels + array([0, 0, 1, 2, 0]) + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + With ``sort=True``, the `uniques` will be sorted, and `labels` will be + shulffled so that the relationship is the maintained. + + >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> labels + array([1, 1, 0, 2, 1]) + + >>> uniques + array(['a', 'b', 'c'], dtype=object) + + Missing values are indicated by `na_sentinel` (``-1`` by default). Note + that missing values are never included in `uniques`. + + >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> labels + array([ 0, -1, 1, 2, 0]) + + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + Thus far, we've only factorized lists (which are internally coerced to + NumPy arrays). When factorizing pandas objects, the type of `uniques` + will differ. For Categoricals, a `Categorical` is returned. + + >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) + >>> labels, uniques = pd.factorize(cat) + >>> labels + array([0, 0, 1]) + + >>> uniques + [a, c] + Categories (3, object): [a, b, c] + + Notice that ``'b'`` is in ``uniques.categories``, desipite not being + present in ``cat.values``. + + For all other pandas objects, an Index of the appropriate type is + returned. + + >>> cat = pd.Series(['a', 'a', 'c']) + >>> labels, uniques = pd.factorize(cat) + >>> labels + array([0, 0, 1]) + + >>> uniques + Index(['a', 'c'], dtype='object') + """ + + @Substitution( values=dedent("""\ values : Sequence diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ed73b27b4008c..0676bac18fb49 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -31,6 +31,7 @@ is_dict_like) from pandas.core.algorithms import factorize, take_1d, unique1d +from pandas.core.algorithms import _shared_docs as _algos_shared_docs from pandas.core.accessor import PandasDelegate from pandas.core.base import (PandasObject, NoNewAttributesMixin, _shared_docs) @@ -2046,7 +2047,7 @@ def unique(self): return cat.set_categories(cat.categories.take(take_codes)) @Substitution(values='', size_hint='', sort='') - @Appender(_shared_docs['factorize']) + @Appender(_algos_shared_docs['factorize']) def factorize(self, na_sentinel=-1): from pandas.core.algorithms import _factorize_array diff --git a/pandas/core/base.py b/pandas/core/base.py index 56c3f96461088..580a3bb89dede 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1093,100 +1093,6 @@ def memory_usage(self, deep=False): v += lib.memory_usage_of_objects(self.values) return v - _shared_docs['factorize'] = """ - Encode the object as an enumerated type or categorical variable. - - This method is useful for obtaining a numeric representation of - when all that matters is identifying distinct values. `factorize` - is available as both a top-level function :func:`pandas.factorize`, - and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. - - Parameters - ---------- - %(values)s%(sort)s - na_sentinel : int, default -1 - Value to mark "not found". - %(size_hint)s\ - - Returns - ------- - labels : ndarray - An integer ndarray that's an indexer into `uniques`. - ``uniques.take(labels)`` will have the same values as `values`. - uniques : ndarray, Index, or Categorical - The unique valid values. When `values` is Categorical, `uniques` - is a Categorical. When `values` is some other pandas object, an - `Index` is returned. Otherwise, a 1-D ndarray is returned. - - .. note :: - - Even if there's a missing value in `values`, `uniques` will - *not* contain an entry for it. - - See Also - -------- - pandas.cut : Discretize continuous-valued array. - - Examples - -------- - These examples all show factorize as a top-level method like - ``pd.factorize(values)``. The results are identical for methods like - ``Series.factorize``. - - >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) - >>> labels - array([0, 0, 1, 2, 0]) - >>> uniques - array(['b', 'a', 'c'], dtype=object) - - With ``sort=True``, the `uniques` will be sorted, and `labels` will be - shulffled so that the relationship is the maintained. - - >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) - >>> labels - array([1, 1, 0, 2, 1]) - - >>> uniques - array(['a', 'b', 'c'], dtype=object) - - Missing values are indicated by `na_sentinel` (``-1`` by default). Note - that missing values are never included in `uniques`. - - >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) - >>> labels - array([ 0, -1, 1, 2, 0]) - - >>> uniques - array(['b', 'a', 'c'], dtype=object) - - Thus far, we've only factorized lists (which are internally coerced to - NumPy arrays). When factorizing pandas objects, the type of `uniques` - will differ. For Categoricals, a `Categorical` is returned. - - >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) - >>> labels, uniques = pd.factorize(cat) - >>> labels - array([0, 0, 1]) - - >>> uniques - [a, c] - Categories (3, object): [a, b, c] - - Notice that ``'b'`` is in ``uniques.categories``, desipite not being - present in ``cat.values``. - - For all other pandas objects, an Index of the appropriate type is - returned. - - >>> cat = pd.Series(['a', 'a', 'c']) - >>> labels, uniques = pd.factorize(cat) - >>> labels - array([0, 0, 1]) - - >>> uniques - Index(['a', 'c'], dtype='object') - """ - @Substitution( values='', sort=textwrap.dedent("""\ @@ -1195,10 +1101,9 @@ def memory_usage(self, deep=False): relationship. """), size_hint='') - @Appender(_shared_docs['factorize']) + @Appender(algorithms._shared_docs['factorize']) def factorize(self, sort=False, na_sentinel=-1): - from pandas.core.algorithms import factorize - return factorize(self, sort=sort, na_sentinel=na_sentinel) + return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs['searchsorted'] = ( """Find indices where elements should be inserted to maintain order. From eb19488919575676c41cac4f2da0ac0335f881a0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 14:42:32 -0500 Subject: [PATCH 17/47] BUG: py2 bug --- pandas/tests/extension/decimal/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 736556e4be20d..f1852542088ff 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -71,6 +71,7 @@ def isna(self): return np.array([x.is_nan() for x in self.values]) def take(self, indexer, allow_fill=True, fill_value=None): + indexer = np.asarray(indexer) mask = indexer == -1 indexer = _ensure_platform_int(indexer) From cbfee1abd71c731a3a753fd983c7e004f20abe71 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 14:48:23 -0500 Subject: [PATCH 18/47] Typo, ref --- pandas/core/algorithms.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 02510b488ded7..bb40800f796b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -506,7 +506,7 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): -------- These examples all show factorize as a top-level method like ``pd.factorize(values)``. The results are identical for methods like - ``Series.factorize``. + :meth:`Series.factorize`. >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) >>> labels @@ -515,12 +515,11 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): array(['b', 'a', 'c'], dtype=object) With ``sort=True``, the `uniques` will be sorted, and `labels` will be - shulffled so that the relationship is the maintained. + shuffled so that the relationship is the maintained. >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) >>> labels array([1, 1, 0, 2, 1]) - >>> uniques array(['a', 'b', 'c'], dtype=object) @@ -530,7 +529,6 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) >>> labels array([ 0, -1, 1, 2, 0]) - >>> uniques array(['b', 'a', 'c'], dtype=object) @@ -542,7 +540,6 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): >>> labels, uniques = pd.factorize(cat) >>> labels array([0, 0, 1]) - >>> uniques [a, c] Categories (3, object): [a, b, c] From 35a8977381559571f1eeabe52523b3ddc2791fcb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Mar 2018 06:55:22 -0500 Subject: [PATCH 19/47] Change name --- pandas/core/arrays/base.py | 25 ++++++++++++++++++++----- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 322f71ad83580..7bb6165d0b760 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -237,16 +237,31 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_argsort(self): + def _simple_ndarray(self): # type: () -> ndarray - """Get the ndarray to be passed to np.argsort. + """Convert the array to a simple ndarray representaiton. - This is called from within 'ExtensionArray.argsort'. + Many methods can operate indirectly on a cheap-to-compute array that + is somehow representative of the extension array. For example, rather + than sorting an ExtensionArray directly, which might be expensive, + we could convert the ExtensionArray to a representative ndarray of + integers, sort the integers, and perform a ``take``. + + The coversion between ExtensionArray and the simple ndarray should be + strictly monotonic https://en.wikipedia.org/wiki/Monotonic_function, + and as cheap to compute as possible. Returns ------- values : ndarray + + See Also + -------- + ExtensionArray.argsort """ + # Implemnetor note: This method is currently used in + # - ExtensionArray.argsort + return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): @@ -274,11 +289,11 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ # Implementor note: You have two places to override the behavior of # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort + # 1. _simple_ndarray : construct the values passed to np.argsort # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() + values = self._simple_ndarray() result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4b1238f4fbf05..814bf70c102e6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1390,7 +1390,7 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _values_for_argsort(self): + def _simple_ndarray(self): return self._codes.copy() def argsort(self, *args, **kwargs): From ef8e6cb721180635cccede92d25f40d4568118e6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Mar 2018 07:24:12 -0500 Subject: [PATCH 20/47] Fixed docs --- pandas/core/algorithms.py | 19 +++++++++++++------ pandas/core/arrays/categorical.py | 2 +- pandas/core/base.py | 5 ++--- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bb40800f796b6..9beea013f776b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -478,7 +478,7 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): Parameters ---------- - %(values)s%(sort)s + %(values)s%(sort)s%(order)s na_sentinel : int, default -1 Value to mark "not found". %(size_hint)s\ @@ -501,6 +501,7 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): See Also -------- pandas.cut : Discretize continuous-valued array. + pandas.unique : Find the unique valuse in an array. Examples -------- @@ -523,8 +524,9 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): >>> uniques array(['a', 'b', 'c'], dtype=object) - Missing values are indicated by `na_sentinel` (``-1`` by default). Note - that missing values are never included in `uniques`. + Missing values are indicated in `labels` with `na_sentinel` + (``-1`` by default). Note that missing values are never + included in `uniques`. >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) >>> labels @@ -554,7 +556,6 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): >>> labels, uniques = pd.factorize(cat) >>> labels array([0, 0, 1]) - >>> uniques Index(['a', 'c'], dtype='object') """ @@ -562,12 +563,18 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): @Substitution( values=dedent("""\ - values : Sequence + values : sequence A 1-D seqeunce. Sequences that aren't pandas objects are coereced to ndarrays before factorization. """), + order=dedent("""\ + order + .. deprecated:: 0.23.0 + + This parameter has no effect and is deprecated. + """), sort=dedent("""\ - sort : boolean, default False + sort : bool, default False Sort `uniques` and shuffle `labels` to maintain the relationship. """), diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0676bac18fb49..0e96eab9c708c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2046,7 +2046,7 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) - @Substitution(values='', size_hint='', sort='') + @Substitution(values='', size_hint='', sort='', order='') @Appender(_algos_shared_docs['factorize']) def factorize(self, na_sentinel=-1): from pandas.core.algorithms import _factorize_array diff --git a/pandas/core/base.py b/pandas/core/base.py index 580a3bb89dede..57b9dc86ded6a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1094,13 +1094,12 @@ def memory_usage(self, deep=False): return v @Substitution( - values='', + values='', order='', size_hint='', sort=textwrap.dedent("""\ sort : boolean, default False Sort `uniques` and shuffle `labels` to maintain the relationship. - """), - size_hint='') + """)) @Appender(algorithms._shared_docs['factorize']) def factorize(self, sort=False, na_sentinel=-1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) From 6a6034f2d7a1e5c7ae1c89ce1d5db93c585fb955 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 17 Mar 2018 07:26:28 -0500 Subject: [PATCH 21/47] Wording --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9beea013f776b..6ce964cc50d33 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -471,8 +471,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): _shared_docs['factorize'] = """ Encode the object as an enumerated type or categorical variable. - This method is useful for obtaining a numeric representation of - when all that matters is identifying distinct values. `factorize` + This method is useful for obtaining a numeric representation of an + array when all that matters is identifying distinct values. `factorize` is available as both a top-level function :func:`pandas.factorize`, and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. From 55263985cbfae00fd58e42f14bce37eaca01d227 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Mar 2018 10:07:53 -0500 Subject: [PATCH 22/47] fixup! Wording --- pandas/core/arrays/base.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c15a069f854d3..a7926dbb21654 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -76,6 +76,10 @@ def _constructor_from_sequence(cls, scalars): """ raise AbstractMethodError(cls) + @classmethod + def _constructor_from_simple_ndarray(cls, values, instance): + raise AbstractMethodError(cls) + # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ @@ -301,6 +305,9 @@ def unique(self): uniques = unique(self.astype(object)) return self._constructor_from_sequence(uniques) + def _simple_ndarray(self): + return self.astype(object) + def factorize(self, na_sentinel=-1): """Encode the extension array as an enumerated type. @@ -335,12 +342,12 @@ def factorize(self, na_sentinel=-1): from pandas.core.algorithms import _factorize_array mask = self.isna() - arr = self.astype(object) + arr = self._simple_ndarray() arr[mask] = np.nan labels, uniques = _factorize_array(arr, check_nulls=True, na_sentinel=na_sentinel) - uniques = self._constructor_from_sequence(uniques) + uniques = self._constructor_from_simple_ndarray(uniques, instance=arr) return labels, uniques # ------------------------------------------------------------------------ From d5e819822d9e980021ea17bfd197a300422d7015 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Mar 2018 11:18:41 -0500 Subject: [PATCH 23/47] Back to _values_for_argsort --- pandas/core/arrays/base.py | 27 ++++++++------------------- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8b539e20894a9..f4c77263bf21b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -237,31 +237,21 @@ def isna(self): """ raise AbstractMethodError(self) - def _simple_ndarray(self): + def _values_for_argsort(self): # type: () -> ndarray - """Convert the array to a simple ndarray representaiton. - - Many methods can operate indirectly on a cheap-to-compute array that - is somehow representative of the extension array. For example, rather - than sorting an ExtensionArray directly, which might be expensive, - we could convert the ExtensionArray to a representative ndarray of - integers, sort the integers, and perform a ``take``. - - The coversion between ExtensionArray and the simple ndarray should be - strictly monotonic https://en.wikipedia.org/wiki/Monotonic_function, - and as cheap to compute as possible. + """Return values for sorting. Returns ------- - values : ndarray + ndarray + The transformed values should maintain the ordering between values + within the array. See Also -------- ExtensionArray.argsort """ - # Implemnetor note: This method is currently used in - # - ExtensionArray.argsort - + # Note: this is used in `ExtensionArray.argsort`. return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): @@ -289,11 +279,10 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ # Implementor note: You have two places to override the behavior of # argsort. - # 1. _simple_ndarray : construct the values passed to np.argsort + # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._simple_ndarray() + values = self._values_for_argsort() result = np.argsort(values, kind=kind, **kwargs) if not ascending: result = result[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 43b985686af89..13384dd56a9c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1378,7 +1378,7 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _simple_ndarray(self): + def _values_for_argsort(self): return self._codes.copy() def argsort(self, *args, **kwargs): From 30941cb2afe36f3cd78801a2e046a4114773e107 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Mar 2018 12:04:30 -0500 Subject: [PATCH 24/47] Example with _from_factorize --- pandas/core/arrays/base.py | 35 ++++++++++++++++++++----- pandas/tests/extension/decimal/array.py | 4 +++ pandas/tests/extension/json/array.py | 20 +++++--------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a7926dbb21654..a7930ca8a59b9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -77,7 +77,21 @@ def _constructor_from_sequence(cls, scalars): raise AbstractMethodError(cls) @classmethod - def _constructor_from_simple_ndarray(cls, values, instance): + def _from_factorized(cls, values, original): + """Reconstruct an ExtensionArray after factorization. + + Parameters + ---------- + values : ndarray + An integer ndarray with the factorized values. + original : ExtensionArray + The original ndarray that was factorized. + + See Also + -------- + pandas.factorize + ExtensionArray.factorize + """ raise AbstractMethodError(cls) # ------------------------------------------------------------------------ @@ -305,7 +319,16 @@ def unique(self): uniques = unique(self.astype(object)) return self._constructor_from_sequence(uniques) - def _simple_ndarray(self): + def _values_for_factorize(self): + """Return an array suitable for factorization. + + Returns + ------- + ndarray + An array suitable for factoraization. This should maintain order + and be a supported dtype. + + """ return self.astype(object) def factorize(self, na_sentinel=-1): @@ -337,17 +360,17 @@ def factorize(self, na_sentinel=-1): ----- :meth:`pandas.factorize` offers a `sort` keyword as well. """ - # Implementor note: make sure to exclude missing values from your - # `uniques`. It should only contain non-NA values. + # Implementor notes: There are two options for overriding the + # behavior of `factorize`: here and `_values_for_factorize`. from pandas.core.algorithms import _factorize_array mask = self.isna() - arr = self._simple_ndarray() + arr = self._values_for_factorize() arr[mask] = np.nan labels, uniques = _factorize_array(arr, check_nulls=True, na_sentinel=na_sentinel) - uniques = self._constructor_from_simple_ndarray(uniques, instance=arr) + uniques = self._from_factorized(uniques, arr) return labels, uniques # ------------------------------------------------------------------------ diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index f1852542088ff..b66a14c77a059 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -36,6 +36,10 @@ def __init__(self, values): def _constructor_from_sequence(cls, scalars): return cls(scalars) + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.values[item] diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index fcfde3911b808..31e382d45bf49 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -7,7 +7,6 @@ import numpy as np -import pandas as pd from pandas.core.dtypes.base import ExtensionDtype from pandas.core.arrays import ExtensionArray @@ -38,6 +37,10 @@ def __init__(self, values): def _constructor_from_sequence(cls, scalars): return cls(scalars) + @classmethod + def _from_factorized(cls, values, original): + return cls([collections.UserDict(x) for x in values if x != ()]) + def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] @@ -105,20 +108,9 @@ def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) return cls(data) - def factorize(self, na_sentinel=-1): + def _values_for_factorize(self): frozen = tuple(tuple(x.items()) for x in self) - labels, uniques = pd.factorize(frozen) - - # fixup NA - if self.isna().any(): - na_code = labels[self.isna()][0] - - labels[labels == na_code] = na_sentinel - labels[labels > na_code] -= 1 - - uniques = JSONArray([collections.UserDict(x) - for x in uniques if x != ()]) - return labels, uniques + return np.array(frozen, dtype=object) def make_data(): From c776133d42beb81d94ab5000da8bb2ad56f53e90 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Mar 2018 13:58:25 -0500 Subject: [PATCH 25/47] Unskip most JSON tests --- pandas/tests/extension/json/array.py | 11 +++++++++++ pandas/tests/extension/json/test_json.py | 25 ++++++------------------ 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 322944129146a..ee0951812b8f0 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -44,7 +44,11 @@ def __getitem__(self, item): return self._constructor_from_sequence([ x for x, m in zip(self, item) if m ]) + elif isinstance(item, collections.Iterable): + # fancy indexing + return type(self)([self.data[i] for i in item]) else: + # slice return type(self)(self.data[item]) def __setitem__(self, key, value): @@ -104,6 +108,13 @@ def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) return cls(data) + def _values_for_argsort(self): + # Disable NumPy's shape inference by including an empty tuple... + # If all the elemnts of self are the same size P, NumPy will + # cast them to an (N, P) array, instead of an (N,) array of tuples. + frozen = [()] + list(tuple(x.items()) for x in self) + return np.array(frozen, dtype=object)[1:] + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index eab75c306e02a..339be8a2633cb 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -36,7 +36,7 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - return JSONArray([{'b': 1}, {}, {'c': 4}]) + return JSONArray([{'b': 1}, {}, {'a': 4}]) @pytest.fixture @@ -80,28 +80,15 @@ def test_fillna_frame(self): class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="Unhashable") - def test_value_counts(self, all_data, dropna): - pass + unhashable = pytest.mark.skip(reason="Unhashable") - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_argsort(self): - pass - - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_argsort_missing(self): - pass - - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_sort_values(self): - pass - - @pytest.mark.skip(reason="Dictionaries are not orderable.") - def test_sort_values_missing(self): + @unhashable + def test_value_counts(self, all_data, dropna): pass - @pytest.mark.skip(reason="Dictionaries are not orderable.") + @unhashable def test_sort_values_frame(self): + # TODO (EA.factorize): see if _values_for_factorize allows this. pass From 6ca65f867895b23b95c9439612abec4e6724697c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Mar 2018 07:57:32 -0500 Subject: [PATCH 26/47] Overridable na_value too. This has the EA author specify the NA value before factorization. This satisfies the Categorical use case. --- pandas/core/arrays/base.py | 22 +++++++++++++--------- pandas/core/arrays/categorical.py | 21 ++++++++------------- pandas/tests/extension/json/array.py | 2 +- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8f1f1eb208acb..2369df1bc5db6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -86,7 +86,7 @@ def _from_factorized(cls, values, original): values : ndarray An integer ndarray with the factorized values. original : ExtensionArray - The original ndarray that was factorized. + The original ExtensionArray that factorize was called on. See Also -------- @@ -372,16 +372,20 @@ def unique(self): return self._constructor_from_sequence(uniques) def _values_for_factorize(self): - """Return an array suitable for factorization. + """Return an array and missing value suitable for factorization. Returns ------- - ndarray + values : ndarray An array suitable for factoraization. This should maintain order - and be a supported dtype. - + and be a supported dtype. By default, the extension array is cast + to object dtype. + na_value : scalar + The missing value to insert before factorization. Note that this + differs from `na_sentinel`, which is in the missing value sentinel + after factorization. By default, ``np.nan`` is used. """ - return self.astype(object) + return self.astype(object), np.nan def factorize(self, na_sentinel=-1): """Encode the extension array as an enumerated type. @@ -417,12 +421,12 @@ def factorize(self, na_sentinel=-1): from pandas.core.algorithms import _factorize_array mask = self.isna() - arr = self._values_for_factorize() - arr[mask] = np.nan + arr, na_value = self._values_for_factorize() + arr[mask] = na_value labels, uniques = _factorize_array(arr, check_nulls=True, na_sentinel=na_sentinel) - uniques = self._from_factorized(uniques, arr) + uniques = self._from_factorized(uniques, self) return labels, uniques # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d05d966c642fb..05b12b51e1d82 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -31,7 +31,6 @@ is_dict_like) from pandas.core.algorithms import factorize, take_1d, unique1d -from pandas.core.algorithms import _shared_docs as _algos_shared_docs from pandas.core.accessor import PandasDelegate from pandas.core.base import (PandasObject, NoNewAttributesMixin, _shared_docs) @@ -2067,21 +2066,17 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) - @Substitution(values='', size_hint='', sort='', order='') - @Appender(_algos_shared_docs['factorize']) - def factorize(self, na_sentinel=-1): - from pandas.core.algorithms import _factorize_array - + def _values_for_factorize(self): codes = self.codes.astype('int64') - codes[codes == -1] = iNaT # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. - labels, uniques = _factorize_array(codes, check_nulls=True, - na_sentinel=na_sentinel) - uniques = self._constructor(self.categories.take(uniques), - categories=self.categories, - ordered=self.ordered) - return labels, uniques + return codes, iNaT + + @classmethod + def _from_factorized(cls, uniques, original): + return original._constructor(original.categories.take(uniques), + categories=original.categories, + ordered=original.ordered) def equals(self, other): """ diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 32da0acf6fb48..dadc96ae983df 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -114,7 +114,7 @@ def _concat_same_type(cls, to_concat): def _values_for_factorize(self): frozen = tuple(tuple(x.items()) for x in self) - return np.array(frozen, dtype=object) + return np.array(frozen, dtype=object), np.nan def _values_for_argsort(self): # Disable NumPy's shape inference by including an empty tuple... From bbedd8c4fff5a9456ed3d589e61b264f07a40c09 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Mar 2018 08:05:54 -0500 Subject: [PATCH 27/47] Reverted sorting changes --- pandas/core/arrays/base.py | 52 ------------------- pandas/core/arrays/categorical.py | 51 ++++++------------ pandas/tests/extension/base/methods.py | 40 -------------- .../extension/category/test_categorical.py | 12 ----- pandas/tests/extension/conftest.py | 20 ------- .../tests/extension/decimal/test_decimal.py | 14 ----- pandas/tests/extension/json/array.py | 7 --- pandas/tests/extension/json/test_json.py | 10 ---- 8 files changed, 16 insertions(+), 190 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2369df1bc5db6..3d65dc38b8d55 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2,7 +2,6 @@ import numpy as np from pandas.errors import AbstractMethodError -from pandas.compat.numpy import function as nv _not_implemented_message = "{} does not implement {}." @@ -255,57 +254,6 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_argsort(self): - # type: () -> ndarray - """Return values for sorting. - - Returns - ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - - See Also - -------- - ExtensionArray.argsort - """ - # Note: this is used in `ExtensionArray.argsort`. - return np.array(self) - - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): - """ - Return the indices that would sort this array. - - Parameters - ---------- - ascending : bool, default True - Whether the indices should result in an ascending - or descending sort. - kind : {'quicksort', 'mergesort', 'heapsort'}, optional - Sorting algorithm. - *args, **kwargs: - passed through to :func:`numpy.argsort`. - - Returns - ------- - index_array : ndarray - Array of indices that sort ``self``. - - See Also - -------- - numpy.argsort : Sorting implementation used internally. - """ - # Implementor note: You have two places to override the behavior of - # argsort. - # 1. _values_for_argsort : construct the values passed to np.argsort - # 2. argsort : total control over sorting. - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - result = np.argsort(values, kind=kind, **kwargs) - if not ascending: - result = result[::-1] - return result - def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 05b12b51e1d82..0d9f2f82371b6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1381,21 +1381,18 @@ def check_for_ordered(self, op): def _values_for_argsort(self): return self._codes.copy() - def argsort(self, *args, **kwargs): - # TODO(PY2): use correct signature - # We have to do *args, **kwargs to avoid a a py2-only signature - # issue since np.argsort differs from argsort. - """Return the indicies that would sort the Categorical. + def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): + """ + Returns the indices that would sort the Categorical instance if + 'sort_values' was called. This function is implemented to provide + compatibility with numpy ndarray objects. + + While an ordering is applied to the category values, arg-sorting + in this context refers more to organizing and grouping together + based on matching category values. Thus, this function can be + called on an unordered Categorical instance unlike the functions + 'Categorical.min' and 'Categorical.max'. - Parameters - ---------- - ascending : bool, default True - Whether the indices should result in an ascending - or descending sort. - kind : {'quicksort', 'mergesort', 'heapsort'}, optional - Sorting algorithm. - *args, **kwargs: - passed through to :func:`numpy.argsort`. Returns ------- @@ -1404,28 +1401,12 @@ def argsort(self, *args, **kwargs): See also -------- numpy.ndarray.argsort - - Notes - ----- - While an ordering is applied to the category values, arg-sorting - in this context refers more to organizing and grouping together - based on matching category values. Thus, this function can be - called on an unordered Categorical instance unlike the functions - 'Categorical.min' and 'Categorical.max'. - - Examples - -------- - >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() - array([2, 0, 1, 3]) - - >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], - ... categories=['c', 'b', 'a'], - ... ordered=True) - >>> cat.argsort() - array([3, 0, 1, 2]) """ - # Keep the implementation here just for the docstring. - return super(Categorical, self).argsort(*args, **kwargs) + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + result = np.argsort(self._codes.copy(), kind=kind, **kwargs) + if not ascending: + result = result[::-1] + return result def sort_values(self, inplace=False, ascending=True, na_position='last'): """ Sorts the Categorical by category value returning a new diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index f9f079cb21858..577fbbaf477c9 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -33,46 +33,6 @@ def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) - def test_argsort(self, data_for_sorting): - result = pd.Series(data_for_sorting).argsort() - expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) - self.assert_series_equal(result, expected) - - def test_argsort_missing(self, data_missing_for_sorting): - result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) - self.assert_series_equal(result, expected) - - @pytest.mark.parametrize('ascending', [True, False]) - def test_sort_values(self, data_for_sorting, ascending): - ser = pd.Series(data_for_sorting) - result = ser.sort_values(ascending=ascending) - expected = ser.iloc[[2, 0, 1]] - if not ascending: - expected = expected[::-1] - - self.assert_series_equal(result, expected) - - @pytest.mark.parametrize('ascending', [True, False]) - def test_sort_values_missing(self, data_missing_for_sorting, ascending): - ser = pd.Series(data_missing_for_sorting) - result = ser.sort_values(ascending=ascending) - if ascending: - expected = ser.iloc[[2, 0, 1]] - else: - expected = ser.iloc[[0, 2, 1]] - self.assert_series_equal(result, expected) - - @pytest.mark.parametrize('ascending', [True, False]) - def test_sort_values_frame(self, data_for_sorting, ascending): - df = pd.DataFrame({"A": [1, 2, 1], - "B": data_for_sorting}) - result = df.sort_values(['A', 'B']) - expected = pd.DataFrame({"A": [1, 1, 2], - 'B': data_for_sorting.take([2, 0, 1])}, - index=[2, 0, 1]) - self.assert_frame_equal(result, expected) - @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 7528299578326..a351dd033df7c 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -29,18 +29,6 @@ def data_missing(): return Categorical([np.nan, 'A']) -@pytest.fixture -def data_for_sorting(): - return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'], - ordered=True) - - -@pytest.fixture -def data_missing_for_sorting(): - return Categorical(['A', None, 'B'], categories=['B', 'A'], - ordered=True) - - @pytest.fixture def na_value(): return np.nan diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 4cb4ea21d9be3..9904fd7118d27 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -30,26 +30,6 @@ def all_data(request, data, data_missing): return data_missing -@pytest.fixture -def data_for_sorting(): - """Length-3 array with a known sort order. - - This should be three items [B, C, A] with - A < B < C - """ - raise NotImplementedError - - -@pytest.fixture -def data_missing_for_sorting(): - """Length-3 array with a known sort order. - - This should be three items [B, NA, A] with - A < B and NA missing. - """ - raise NotImplementedError - - @pytest.fixture def na_cmp(): """Binary operator for comparing NA values. diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 0f2a60e1831f6..6c54ae2a5ea71 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -25,20 +25,6 @@ def data_missing(): return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) -@pytest.fixture -def data_for_sorting(): - return DecimalArray([decimal.Decimal('1'), - decimal.Decimal('2'), - decimal.Decimal('0')]) - - -@pytest.fixture -def data_missing_for_sorting(): - return DecimalArray([decimal.Decimal('1'), - decimal.Decimal('NaN'), - decimal.Decimal('0')]) - - @pytest.fixture def na_cmp(): return lambda x, y: x.is_nan() and y.is_nan() diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index dadc96ae983df..4692e1f5932c6 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -116,13 +116,6 @@ def _values_for_factorize(self): frozen = tuple(tuple(x.items()) for x in self) return np.array(frozen, dtype=object), np.nan - def _values_for_argsort(self): - # Disable NumPy's shape inference by including an empty tuple... - # If all the elemnts of self are the same size P, NumPy will - # cast them to an (N, P) array, instead of an (N,) array of tuples. - frozen = [()] + list(tuple(x.items()) for x in self) - return np.array(frozen, dtype=object)[1:] - def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index e2b5236453913..2a497aa5f7a24 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -29,16 +29,6 @@ def data_missing(): return JSONArray([{}, {'a': 10}]) -@pytest.fixture -def data_for_sorting(): - return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}]) - - -@pytest.fixture -def data_missing_for_sorting(): - return JSONArray([{'b': 1}, {}, {'a': 4}]) - - @pytest.fixture def na_value(): return {} From 96ecab77f7dc21f3b943a28b8d53fb58c1e1458a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 22 Mar 2018 08:11:27 -0500 Subject: [PATCH 28/47] Remove a bit more argsort --- pandas/core/arrays/categorical.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0d9f2f82371b6..2c098beda7bae 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1378,9 +1378,6 @@ def check_for_ordered(self, op): "you can use .as_ordered() to change the " "Categorical to an ordered one\n".format(op=op)) - def _values_for_argsort(self): - return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ Returns the indices that would sort the Categorical instance if @@ -1393,7 +1390,6 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): called on an unordered Categorical instance unlike the functions 'Categorical.min' and 'Categorical.max'. - Returns ------- argsorted : numpy array From c288d676cbb51373d9c2c25489d3e56db64e57fd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 10:19:30 -0500 Subject: [PATCH 29/47] Mask values going into hashtables 1. Offer _values_for_factorize 2. Hardcode the NA type going into the hashtable. --- pandas/core/arrays/base.py | 20 +++++++++++--------- pandas/core/arrays/categorical.py | 2 +- pandas/tests/extension/json/array.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2369df1bc5db6..4443054b81d4b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -380,12 +380,8 @@ def _values_for_factorize(self): An array suitable for factoraization. This should maintain order and be a supported dtype. By default, the extension array is cast to object dtype. - na_value : scalar - The missing value to insert before factorization. Note that this - differs from `na_sentinel`, which is in the missing value sentinel - after factorization. By default, ``np.nan`` is used. """ - return self.astype(object), np.nan + return self.astype(object) def factorize(self, na_sentinel=-1): """Encode the extension array as an enumerated type. @@ -416,13 +412,19 @@ def factorize(self, na_sentinel=-1): ----- :meth:`pandas.factorize` offers a `sort` keyword as well. """ - # Implementor notes: There are two options for overriding the - # behavior of `factorize`: here and `_values_for_factorize`. from pandas.core.algorithms import _factorize_array + import pandas.core.dtypes.common as com + from pandas._libs.tslib import iNaT mask = self.isna() - arr, na_value = self._values_for_factorize() - arr[mask] = na_value + arr = self._values_for_factorize() + + # Mask values going into the hash table with the appropriate + # NA type. + if com.is_signed_integer_dtype(arr): + arr[mask] = iNaT + elif com.is_float_dtype(arr) or com.is_object_dtype(arr): + arr[mask] = np.nan labels, uniques = _factorize_array(arr, check_nulls=True, na_sentinel=na_sentinel) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b4b09d0e2c193..8995671f94bd9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2123,7 +2123,7 @@ def _values_for_factorize(self): codes = self.codes.astype('int64') # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. - return codes, iNaT + return codes @classmethod def _from_factorized(cls, uniques, original): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index dadc96ae983df..32da0acf6fb48 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -114,7 +114,7 @@ def _concat_same_type(cls, to_concat): def _values_for_factorize(self): frozen = tuple(tuple(x.items()) for x in self) - return np.array(frozen, dtype=object), np.nan + return np.array(frozen, dtype=object) def _values_for_argsort(self): # Disable NumPy's shape inference by including an empty tuple... From 55c9e31108c3ad84b715073ca9d3435e9ad84a4f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 11:23:40 -0500 Subject: [PATCH 30/47] remove stale comment --- pandas/core/arrays/categorical.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8995671f94bd9..ab69fed9539b0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2121,8 +2121,6 @@ def unique(self): def _values_for_factorize(self): codes = self.codes.astype('int64') - # We set missing codes, normally -1, to iNaT so that the - # Int64HashTable treats them as missing values. return codes @classmethod From 163bfa3fbf6879eadd4cc1bdb9940bdd7afaf744 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 13:07:00 -0500 Subject: [PATCH 31/47] wip --- pandas/core/algorithms.py | 15 ++++++++++++++- pandas/core/arrays/base.py | 23 ++++++++++++++--------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6ce964cc50d33..6024c52d55c6b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -585,7 +585,7 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): ) @Appender(_shared_docs['factorize']) @deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, mask=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) # 2.) factorizing labels and uniques @@ -596,14 +596,27 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # should happen here. values = _ensure_arraylike(values) + original = values + # Mask values going into the hash table with the appropriate + # NA type. + if mask is not None: + values = values.copy() + if com.is_signed_integer_dtype(values): + values[mask] = iNaT + elif com.is_float_dtype(values) or com.is_object_dtype(values): + values[mask] = np.nan + + # ughhhhhhhhhhhhhhhhhh still have uint64 issues + if is_extension_array_dtype(values): values = getattr(values, '_values', values) labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) + import pdb; pdb.set_trace() check_nulls = not is_integer_dtype(original) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4443054b81d4b..572a7dffdf965 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -380,6 +380,11 @@ def _values_for_factorize(self): An array suitable for factoraization. This should maintain order and be a supported dtype. By default, the extension array is cast to object dtype. + + Notes + ----- + The value returned by `_values_for_factorized` may be modified + inplace. Make sure it isn't a view on the original data. """ return self.astype(object) @@ -412,22 +417,22 @@ def factorize(self, na_sentinel=-1): ----- :meth:`pandas.factorize` offers a `sort` keyword as well. """ - from pandas.core.algorithms import _factorize_array + # Impelmentor note: There are two ways to override the behavior of + # pandas.factorize + # 1. ExtensionArray._values_for_factories and + # ExtensionArray._from_factorize + # 2. ExtensionArray.factorize + # For the first, you get control over which values are passed to + # pandas' internal factorization method. + from pandas.core.algorithms import factorize import pandas.core.dtypes.common as com from pandas._libs.tslib import iNaT mask = self.isna() arr = self._values_for_factorize() - # Mask values going into the hash table with the appropriate - # NA type. - if com.is_signed_integer_dtype(arr): - arr[mask] = iNaT - elif com.is_float_dtype(arr) or com.is_object_dtype(arr): - arr[mask] = np.nan + labels, uniques = factorize(arr, na_sentinel=na_sentinel, mask=mask) - labels, uniques = _factorize_array(arr, check_nulls=True, - na_sentinel=na_sentinel) uniques = self._from_factorized(uniques, self) return labels, uniques From 872c24a0658a0a3deab91b8b76be46cf8fe34aed Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 11:21:09 -0500 Subject: [PATCH 32/47] ENH: Parametrized NA sentinel for factorize Adds a new keyword `na_value` to control the NA sentinel inside the factorize routine. ```python In [3]: arr = np.array([0, 1, 0, 2], dtype='u8') In [4]: pd.factorize(arr) Out[4]: (array([0, 1, 0, 2]), array([0, 1, 2], dtype=uint64)) In [5]: pd.factorize(arr, na_value=0) Out[5]: (array([-1, 0, -1, 1]), array([1, 2], dtype=uint64)) ``` --- pandas/_libs/hashtable.pxd | 25 ++++++++--- pandas/_libs/hashtable_class_helper.pxi.in | 50 ++++++++++++++++------ pandas/core/algorithms.py | 20 +++++++-- pandas/tests/test_algos.py | 25 +++++++++++ 4 files changed, 97 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index d735b3c0673b2..0599d9f4119be 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -8,32 +8,47 @@ cdef class HashTable: pass cdef class UInt64HashTable(HashTable): - cdef kh_uint64_t *table + cdef: + kh_uint64_t *table + uint64_t na_value + bint use_na_value cpdef get_item(self, uint64_t val) cpdef set_item(self, uint64_t key, Py_ssize_t val) cdef class Int64HashTable(HashTable): - cdef kh_int64_t *table + cdef: + kh_int64_t *table + int64_t na_value + bint use_na_value cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) cdef class Float64HashTable(HashTable): - cdef kh_float64_t *table + cdef: + kh_float64_t *table + float64_t na_value + bint use_na_value cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) cdef class PyObjectHashTable(HashTable): - cdef kh_pymap_t *table + cdef: + kh_pymap_t *table + object na_value + bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) cdef class StringHashTable(HashTable): - cdef kh_str_t *table + cdef: + kh_str_t *table + object na_value + bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bca4e388f3279..bf291240a5dc3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -250,13 +250,13 @@ cdef class HashTable: {{py: -# name, dtype, null_condition, float_group -dtypes = [('Float64', 'float64', 'val != val', True), - ('UInt64', 'uint64', 'False', False), - ('Int64', 'int64', 'val == iNaT', False)] +# name, dtype, null_condition, float_group, default_na_value +dtypes = [('Float64', 'float64', 'val != val', True, 'nan'), + ('UInt64', 'uint64', 'False', False, 0), + ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group) in dtypes: + for (name, dtype, null_condition, float_group, default_na_value) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) @@ -300,16 +300,19 @@ def get_dispatch(dtypes): unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) - yield (name, dtype, null_condition, float_group, unique_template) + yield (name, dtype, null_condition, float_group, default_na_value, unique_template) }} -{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} +{{for name, dtype, null_condition, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): - def __cinit__(self, size_hint=1): + def __cinit__(self, size_hint=1, {{dtype}}_t na_value={{default_na_value}}, + bint use_na_value=False): self.table = kh_init_{{dtype}}() + self.na_value = na_value + self.use_na_value = use_na_value if size_hint is not None: kh_resize_{{dtype}}(self.table, size_hint) @@ -414,18 +417,22 @@ cdef class {{name}}HashTable(HashTable): int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val + {{dtype}}_t val, na_value khiter_t k {{name}}VectorData *ud + bint use_na_value labels = np.empty(n, dtype=np.int64) ud = uniques.data + na_value = self.na_value + use_na_value = self.use_na_value with nogil: for i in range(n): val = values[i] - if check_null and {{null_condition}}: + if ((check_null and {{null_condition}}) or + (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -519,8 +526,11 @@ cdef class StringHashTable(HashTable): # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' - def __init__(self, int size_hint=1): + def __init__(self, int size_hint=1, object na_value=na_string_sentinel, + bint use_na_value=False): self.table = kh_init_str() + self.na_value = na_value + self.use_na_value = use_na_value if size_hint is not None: kh_resize_str(self.table, size_hint) @@ -706,18 +716,23 @@ cdef class StringHashTable(HashTable): char *v char **vecs khiter_t k + bint use_na_value # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + na_value = self.na_value + use_na_value = self.use_na_value + # pre-filter out missing # and assign pointers vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if PyUnicode_Check(val) or PyString_Check(val): + if ((PyUnicode_Check(val) or PyString_Check(val)) and + not (use_na_value and val == na_value)): v = util.get_c_string(val) vecs[i] = v else: @@ -753,8 +768,11 @@ na_sentinel = object cdef class PyObjectHashTable(HashTable): - def __init__(self, size_hint=1): + def __init__(self, size_hint=1, object na_value=na_sentinel, + bint use_na_value=False): self.table = kh_init_pymap() + self.na_value = na_value + self.use_na_value = use_na_value kh_resize_pymap(self.table, size_hint) def __dealloc__(self): @@ -876,14 +894,18 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k + bint use_na_value labels = np.empty(n, dtype=np.int64) + na_value = self.na_value + use_na_value = self.use_na_value for i in range(n): val = values[i] hash(val) - if check_null and val != val or val is None: + if ((check_null and val != val or val is None) or + (use_na_value and val == na_value)): labels[i] = na_sentinel continue diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index de2e638265f1e..d21022f5a8c22 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,7 +435,8 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): +def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, + na_value=None): """Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -455,7 +456,13 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - table = hash_klass(size_hint or len(values)) + use_na_value = na_value is not None + kwargs = dict(use_na_value=use_na_value) + + if use_na_value: + kwargs['na_value'] = na_value + + table = hash_klass(size_hint or len(values), **kwargs) uniques = vec_klass() labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) @@ -465,7 +472,8 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): @deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, + na_value=None): """ Encode input values as an enumerated type or categorical variable @@ -479,6 +487,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer + na_value : object, optional + A value in `values` to consider missing. Returns ------- @@ -509,9 +519,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): else: values, dtype, _ = _ensure_data(values) check_nulls = not is_integer_dtype(original) + labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, - size_hint=size_hint) + size_hint=size_hint, + na_value=na_value) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 884b1eb7342c6..0ec28e05e27fc 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -257,6 +257,31 @@ def test_deprecate_order(self): with tm.assert_produces_warning(False): algos.factorize(data) + @pytest.mark.parametrize('data', [ + np.array([0, 1, 0], dtype='u8'), + np.array([-2**63, 1, -2**63], dtype='i8'), + np.array(['__nan__', 'foo', '__nan__'], dtype='object'), + ]) + def test_parametrized_factorize_na_value_default(self, data): + # arrays that include the NA default for that type, but isn't used. + l, u = pd.factorize(data) + expected_uniques = data[[0, 1]] + expected_labels = np.array([0, 1, 0]) + tm.assert_numpy_array_equal(l, expected_labels) + tm.assert_numpy_array_equal(u, expected_uniques) + + @pytest.mark.parametrize('data, na_value', [ + (np.array([0, 1, 0, 2], dtype='u8'), 0), + (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), + (np.array(['', 'a', '', 'b'], dtype=object), '') + ]) + def test_parametrized_factorize_na_value(self, data, na_value): + l, u = pd.factorize(data, na_value=na_value) + expected_uniques = data[[1, 3]] + expected_labels = np.array([-1, 0, -1, 1]) + tm.assert_numpy_array_equal(l, expected_labels) + tm.assert_numpy_array_equal(u, expected_uniques) + class TestUnique(object): From 3c184285e8ef3fbffd193985f324cfe42e0f3bab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 15:07:11 -0500 Subject: [PATCH 33/47] REF: Moved to get_labels --- pandas/_libs/hashtable.pxd | 10 ------ pandas/_libs/hashtable_class_helper.pxi.in | 39 ++++++++-------------- pandas/core/algorithms.py | 5 +-- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 0599d9f4119be..445f4f1c7e751 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -10,8 +10,6 @@ cdef class HashTable: cdef class UInt64HashTable(HashTable): cdef: kh_uint64_t *table - uint64_t na_value - bint use_na_value cpdef get_item(self, uint64_t val) cpdef set_item(self, uint64_t key, Py_ssize_t val) @@ -19,8 +17,6 @@ cdef class UInt64HashTable(HashTable): cdef class Int64HashTable(HashTable): cdef: kh_int64_t *table - int64_t na_value - bint use_na_value cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) @@ -28,8 +24,6 @@ cdef class Int64HashTable(HashTable): cdef class Float64HashTable(HashTable): cdef: kh_float64_t *table - float64_t na_value - bint use_na_value cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) @@ -37,8 +31,6 @@ cdef class Float64HashTable(HashTable): cdef class PyObjectHashTable(HashTable): cdef: kh_pymap_t *table - object na_value - bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) @@ -47,8 +39,6 @@ cdef class PyObjectHashTable(HashTable): cdef class StringHashTable(HashTable): cdef: kh_str_t *table - object na_value - bint use_na_value cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bf291240a5dc3..af60eb4b1d56d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -308,11 +308,8 @@ def get_dispatch(dtypes): cdef class {{name}}HashTable(HashTable): - def __cinit__(self, size_hint=1, {{dtype}}_t na_value={{default_na_value}}, - bint use_na_value=False): + def __cinit__(self, size_hint=1): self.table = kh_init_{{dtype}}() - self.na_value = na_value - self.use_na_value = use_na_value if size_hint is not None: kh_resize_{{dtype}}(self.table, size_hint) @@ -411,21 +408,20 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): + bint check_null=True, + {{dtype}}_t na_value={{default_na_value}}, + bint use_na_value=False): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val, na_value + {{dtype}}_t val khiter_t k {{name}}VectorData *ud - bint use_na_value labels = np.empty(n, dtype=np.int64) ud = uniques.data - na_value = self.na_value - use_na_value = self.use_na_value with nogil: for i in range(n): @@ -526,11 +522,8 @@ cdef class StringHashTable(HashTable): # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' - def __init__(self, int size_hint=1, object na_value=na_string_sentinel, - bint use_na_value=False): + def __init__(self, int size_hint=1): self.table = kh_init_str() - self.na_value = na_value - self.use_na_value = use_na_value if size_hint is not None: kh_resize_str(self.table, size_hint) @@ -705,7 +698,9 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=1): + bint check_null=1, + object na_value=na_string_sentinel, + bint use_na_value=False): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -716,15 +711,11 @@ cdef class StringHashTable(HashTable): char *v char **vecs khiter_t k - bint use_na_value # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) - na_value = self.na_value - use_na_value = self.use_na_value - # pre-filter out missing # and assign pointers vecs = malloc(n * sizeof(char *)) @@ -768,11 +759,8 @@ na_sentinel = object cdef class PyObjectHashTable(HashTable): - def __init__(self, size_hint=1, object na_value=na_sentinel, - bint use_na_value=False): + def __init__(self, size_hint=1): self.table = kh_init_pymap() - self.na_value = na_value - self.use_na_value = use_na_value kh_resize_pymap(self.table, size_hint) def __dealloc__(self): @@ -886,7 +874,9 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): + bint check_null=True, + object na_value=na_sentinel, + bint use_na_value=False): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -894,11 +884,8 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - bint use_na_value labels = np.empty(n, dtype=np.int64) - na_value = self.na_value - use_na_value = self.use_na_value for i in range(n): val = values[i] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d21022f5a8c22..bbf60c2ee7875 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -462,9 +462,10 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, if use_na_value: kwargs['na_value'] = na_value - table = hash_klass(size_hint or len(values), **kwargs) + table = hash_klass(size_hint or len(values)) uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, + **kwargs) labels = _ensure_platform_int(labels) uniques = uniques.to_array() From 703ab8a2b1a96a89577740b99af7d09375608131 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 16:47:00 -0500 Subject: [PATCH 34/47] Remove python-level use_na_value --- pandas/_libs/hashtable_class_helper.pxi.in | 31 +++++++++++++++------- pandas/core/algorithms.py | 8 +----- pandas/tests/test_algos.py | 4 ++- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index af60eb4b1d56d..3376103d5c2af 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -409,26 +409,37 @@ cdef class {{name}}HashTable(HashTable): def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, bint check_null=True, - {{dtype}}_t na_value={{default_na_value}}, - bint use_na_value=False): + object na_value=None): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 - {{dtype}}_t val + {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud + bint use_na_value labels = np.empty(n, dtype=np.int64) ud = uniques.data + use_na_value = na_value is not None + + if use_na_value: + # We need this na_value2 because we want to allow users + # to *optionally* specify an NA sentinel *of the correct* type. + # We use None, to make it optional, which requires `object` type + # for the parameter. To please the compiler, we use na_value2, + # which is only used if it's *specified*. + na_value2 = <{{dtype}}_t>na_value + else: + na_value2 = {{default_na_value}} with nogil: for i in range(n): val = values[i] if ((check_null and {{null_condition}}) or - (use_na_value and val == na_value)): + (use_na_value and val == na_value2)): labels[i] = na_sentinel continue @@ -699,22 +710,23 @@ cdef class StringHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, bint check_null=1, - object na_value=na_string_sentinel, - bint use_na_value=False): + object na_value=None): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels int64_t[:] uindexer Py_ssize_t idx, count = count_prior int ret = 0 - object val + object val, na_value2 char *v char **vecs khiter_t k + bint use_na_value # these by-definition *must* be strings labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None # pre-filter out missing # and assign pointers @@ -875,8 +887,7 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, bint check_null=True, - object na_value=na_sentinel, - bint use_na_value=False): + object na_value=None): cdef: Py_ssize_t i, n = len(values) int64_t[:] labels @@ -884,8 +895,10 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k + bint use_na_value labels = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None for i in range(n): val = values[i] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bbf60c2ee7875..86a33faf5449f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -456,16 +456,10 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - use_na_value = na_value is not None - kwargs = dict(use_na_value=use_na_value) - - if use_na_value: - kwargs['na_value'] = na_value - table = hash_klass(size_hint or len(values)) uniques = vec_klass() labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, - **kwargs) + na_value=na_value) labels = _ensure_platform_int(labels) uniques = uniques.to_array() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0ec28e05e27fc..c7ab3c53613ee 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -272,8 +272,10 @@ def test_parametrized_factorize_na_value_default(self, data): @pytest.mark.parametrize('data, na_value', [ (np.array([0, 1, 0, 2], dtype='u8'), 0), + (np.array([1, 0, 1, 2], dtype='u8'), 1), (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), - (np.array(['', 'a', '', 'b'], dtype=object), '') + (np.array([1, -2**63, 1, 0], dtype='i8'), 1), + (np.array(['a', '', 'a', 'b'], dtype=object), 'a') ]) def test_parametrized_factorize_na_value(self, data, na_value): l, u = pd.factorize(data, na_value=na_value) From ab32e0fe3922e2b4743c68ec0fed535910ef6137 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Mar 2018 20:24:38 -0500 Subject: [PATCH 35/47] REF: More cleanup Removed null_condition from the template. Had Categorical use na_value --- pandas/_libs/hashtable_class_helper.pxi.in | 19 +++++++++---------- pandas/core/algorithms.py | 2 +- pandas/core/arrays/categorical.py | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3376103d5c2af..b21954f9a6601 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -250,13 +250,13 @@ cdef class HashTable: {{py: -# name, dtype, null_condition, float_group, default_na_value -dtypes = [('Float64', 'float64', 'val != val', True, 'nan'), - ('UInt64', 'uint64', 'False', False, 0), - ('Int64', 'int64', 'val == iNaT', False, 'iNaT')] +# name, dtype, float_group, default_na_value +dtypes = [('Float64', 'float64', True, 'nan'), + ('UInt64', 'uint64', False, 0), + ('Int64', 'int64', False, 'iNaT')] def get_dispatch(dtypes): - for (name, dtype, null_condition, float_group, default_na_value) in dtypes: + for (name, dtype, float_group, default_na_value) in dtypes: unique_template = """\ cdef: Py_ssize_t i, n = len(values) @@ -298,13 +298,13 @@ def get_dispatch(dtypes): return uniques.to_array() """ - unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) + unique_template = unique_template.format(name=name, dtype=dtype, float_group=float_group) - yield (name, dtype, null_condition, float_group, default_na_value, unique_template) + yield (name, dtype, float_group, default_na_value, unique_template) }} -{{for name, dtype, null_condition, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} +{{for name, dtype, float_group, default_na_value, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -438,8 +438,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ((check_null and {{null_condition}}) or - (use_na_value and val == na_value2)): + if check_null and (val != val or val == na_value2): labels[i] = na_sentinel continue diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 86a33faf5449f..88212a4d69bdc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -513,7 +513,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = not is_integer_dtype(original) + check_nulls = na_value is not None or not is_integer_dtype(original) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eadef37da344..7a0da69fdbcea 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2163,11 +2163,11 @@ def factorize(self, na_sentinel=-1): from pandas.core.algorithms import _factorize_array codes = self.codes.astype('int64') - codes[codes == -1] = iNaT # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. labels, uniques = _factorize_array(codes, check_nulls=True, - na_sentinel=na_sentinel) + na_sentinel=na_sentinel, + na_value=-1) uniques = self._constructor(self.categories.take(uniques), categories=self.categories, ordered=self.ordered) From 62fa538963cfd6364f569b5a19d9d06718650058 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 06:38:22 -0500 Subject: [PATCH 36/47] API: Make it non-public --- pandas/core/algorithms.py | 17 ++++++++++------- pandas/tests/test_algos.py | 5 +++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88212a4d69bdc..dfefa7cbe3a82 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -446,15 +446,22 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, values : ndarray check_nulls : bool Whether to check for nulls in the hashtable's 'get_labels' method. + Nulls are always checked when `na_value` is specified. na_sentinel : int, default -1 size_hint : int, optional Passsed through to the hashtable's 'get_labels' method + na_value : object, optional + A value in `values` to consider missing. Note: only use this + parameter when you know that you don't have any values pandas would + consider missing in the array (NaN for float data, iNaT for + datetimes, etc.). Returns ------- labels, uniques : ndarray """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + check_nulls = check_nulls or na_value is not None table = hash_klass(size_hint or len(values)) uniques = vec_klass() @@ -467,8 +474,7 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, @deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, - na_value=None): +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -482,8 +488,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer - na_value : object, optional - A value in `values` to consider missing. Returns ------- @@ -513,12 +517,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None, dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = na_value is not None or not is_integer_dtype(original) + check_nulls = not is_integer_dtype(original) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, - size_hint=size_hint, - na_value=na_value) + size_hint=size_hint) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c7ab3c53613ee..fa3f6e775da28 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -264,7 +264,7 @@ def test_deprecate_order(self): ]) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. - l, u = pd.factorize(data) + l, u = algos.factorize(data) expected_uniques = data[[0, 1]] expected_labels = np.array([0, 1, 0]) tm.assert_numpy_array_equal(l, expected_labels) @@ -278,7 +278,8 @@ def test_parametrized_factorize_na_value_default(self, data): (np.array(['a', '', 'a', 'b'], dtype=object), 'a') ]) def test_parametrized_factorize_na_value(self, data, na_value): - l, u = pd.factorize(data, na_value=na_value) + l, u = algos._factorize_array(data, check_nulls=True, + na_value=na_value) expected_uniques = data[[1, 3]] expected_labels = np.array([-1, 0, -1, 1]) tm.assert_numpy_array_equal(l, expected_labels) From 28fad508371466992b65a8e8cefb4866170734aa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 06:54:30 -0500 Subject: [PATCH 37/47] Revert formatting changes in pxd --- pandas/_libs/hashtable.pxd | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 445f4f1c7e751..d735b3c0673b2 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -8,37 +8,32 @@ cdef class HashTable: pass cdef class UInt64HashTable(HashTable): - cdef: - kh_uint64_t *table + cdef kh_uint64_t *table cpdef get_item(self, uint64_t val) cpdef set_item(self, uint64_t key, Py_ssize_t val) cdef class Int64HashTable(HashTable): - cdef: - kh_int64_t *table + cdef kh_int64_t *table cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) cdef class Float64HashTable(HashTable): - cdef: - kh_float64_t *table + cdef kh_float64_t *table cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) cdef class PyObjectHashTable(HashTable): - cdef: - kh_pymap_t *table + cdef kh_pymap_t *table cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) cdef class StringHashTable(HashTable): - cdef: - kh_str_t *table + cdef kh_str_t *table cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) From 85807545ef2a93ff2585920fc233fcb5bc3c2959 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 07:00:36 -0500 Subject: [PATCH 38/47] linting --- pandas/core/arrays/categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7a0da69fdbcea..b25f23cde5c49 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -7,7 +7,6 @@ from pandas import compat from pandas.compat import u, lzip from pandas._libs import lib, algos as libalgos -from pandas._libs.tslib import iNaT from pandas.core.dtypes.generic import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex) From cf14ee18451333c95ae03e0017ca34150f1d8b25 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 07:35:54 -0500 Subject: [PATCH 39/47] Handle bool --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dfefa7cbe3a82..bf192cdb2c300 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -517,7 +517,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = not is_integer_dtype(original) + check_nulls = (not is_integer_dtype(original) and + not is_bool_dtype(original)) labels, uniques = _factorize_array(values, check_nulls, na_sentinel=na_sentinel, From a23d451e08b43ba00f4b3ea8a87f2074e0ceda11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 13:55:56 -0500 Subject: [PATCH 40/47] Specify dtypes --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fa3f6e775da28..ecad2cc042cb3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -266,7 +266,7 @@ def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. l, u = algos.factorize(data) expected_uniques = data[[0, 1]] - expected_labels = np.array([0, 1, 0]) + expected_labels = np.array([0, 1, 0], dtype='i8') tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) @@ -281,7 +281,7 @@ def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, check_nulls=True, na_value=na_value) expected_uniques = data[[1, 3]] - expected_labels = np.array([-1, 0, -1, 1]) + expected_labels = np.array([-1, 0, -1, 1], dtype='i8') tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) From b25f3d498cf6d2cd7ea82cab3cd32e4e8974139b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 17:03:29 -0500 Subject: [PATCH 41/47] Remove unused variable. Added PyObject hashtable test --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/tests/test_algos.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b21954f9a6601..c72f644823ccb 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -716,7 +716,7 @@ cdef class StringHashTable(HashTable): int64_t[:] uindexer Py_ssize_t idx, count = count_prior int ret = 0 - object val, na_value2 + object val char *v char **vecs khiter_t k diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ecad2cc042cb3..5c42caa1720bb 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -275,7 +275,10 @@ def test_parametrized_factorize_na_value_default(self, data): (np.array([1, 0, 1, 2], dtype='u8'), 1), (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), (np.array([1, -2**63, 1, 0], dtype='i8'), 1), - (np.array(['a', '', 'a', 'b'], dtype=object), 'a') + (np.array(['a', '', 'a', 'b'], dtype=object), 'a'), + (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()), + (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object), + ('a', 1)), ]) def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, check_nulls=True, From dfcda85521502147b4e51d8300e05de7cb78b9dd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Mar 2018 19:09:01 -0500 Subject: [PATCH 42/47] REF: Removed check_nulls --- pandas/_libs/hashtable_class_helper.pxi.in | 7 ++----- pandas/core/algorithms.py | 21 +++++++++++---------- pandas/core/arrays/categorical.py | 3 +-- pandas/tests/test_algos.py | 3 +-- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c72f644823ccb..eca66f78499db 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -408,7 +408,6 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -438,7 +437,7 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if check_null and (val != val or val == na_value2): + if val != val or (use_na_value and val == na_value2): labels[i] = na_sentinel continue @@ -708,7 +707,6 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=1, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -885,7 +883,6 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True, object na_value=None): cdef: Py_ssize_t i, n = len(values) @@ -903,7 +900,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if ((check_null and val != val or val is None) or + if ((val != val or val is None) or (use_na_value and val == na_value)): labels[i] = na_sentinel continue diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bf192cdb2c300..3b75b189c0123 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,7 +435,7 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, +def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): """Factorize an array-like to labels and uniques. @@ -444,9 +444,6 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, Parameters ---------- values : ndarray - check_nulls : bool - Whether to check for nulls in the hashtable's 'get_labels' method. - Nulls are always checked when `na_value` is specified. na_sentinel : int, default -1 size_hint : int, optional Passsed through to the hashtable's 'get_labels' method @@ -461,11 +458,10 @@ def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None, labels, uniques : ndarray """ (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - check_nulls = check_nulls or na_value is not None table = hash_klass(size_hint or len(values)) uniques = vec_klass() - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls, + labels = table.get_labels(values, uniques, 0, na_sentinel, na_value=na_value) labels = _ensure_platform_int(labels) @@ -517,12 +513,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - check_nulls = (not is_integer_dtype(original) and - not is_bool_dtype(original)) - labels, uniques = _factorize_array(values, check_nulls, + if (is_datetime64_any_dtype(original) or + is_timedelta64_dtype(original)): + na_value = iNaT + else: + na_value = None + + labels, uniques = _factorize_array(values, na_sentinel=na_sentinel, - size_hint=size_hint) + size_hint=size_hint, + na_value=na_value) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b25f23cde5c49..ac57660300be4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2164,8 +2164,7 @@ def factorize(self, na_sentinel=-1): codes = self.codes.astype('int64') # We set missing codes, normally -1, to iNaT so that the # Int64HashTable treats them as missing values. - labels, uniques = _factorize_array(codes, check_nulls=True, - na_sentinel=na_sentinel, + labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel, na_value=-1) uniques = self._constructor(self.categories.take(uniques), categories=self.categories, diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5c42caa1720bb..ada4f880e92a4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -281,8 +281,7 @@ def test_parametrized_factorize_na_value_default(self, data): ('a', 1)), ]) def test_parametrized_factorize_na_value(self, data, na_value): - l, u = algos._factorize_array(data, check_nulls=True, - na_value=na_value) + l, u = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] expected_labels = np.array([-1, 0, -1, 1], dtype='i8') tm.assert_numpy_array_equal(l, expected_labels) From eaff342dbce56d8b1b8ea09f840477e27754c31b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 06:37:09 -0500 Subject: [PATCH 43/47] BUG: NaT for period --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3b75b189c0123..fcc07dfa3f3a8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -515,7 +515,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): values, dtype, _ = _ensure_data(values) if (is_datetime64_any_dtype(original) or - is_timedelta64_dtype(original)): + is_timedelta64_dtype(original) or + is_period_dtype(original)): na_value = iNaT else: na_value = None From e786253a1e377ed8950268878ef42c2d86443d54 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 06:51:09 -0500 Subject: [PATCH 44/47] Other hashtable --- pandas/_libs/hashtable.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 07b4b80603e03..15d93374da3a9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -70,7 +70,7 @@ cdef class Factorizer: return self.count def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1, - check_null=True): + na_value=None): """ Factorize values with nans replaced by na_sentinel >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) @@ -81,7 +81,7 @@ cdef class Factorizer: uniques.extend(self.uniques.to_array()) self.uniques = uniques labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, check_null) + self.count, na_sentinel, na_value) mask = (labels == na_sentinel) # sort on if sort: @@ -114,7 +114,7 @@ cdef class Int64Factorizer: return self.count def factorize(self, int64_t[:] values, sort=False, - na_sentinel=-1, check_null=True): + na_sentinel=-1, na_value=None): """ Factorize values with nans replaced by na_sentinel >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) @@ -126,7 +126,7 @@ cdef class Int64Factorizer: self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, - check_null) + na_value=na_value) # sort on if sort: From 465d458573c99ee218299c2713c2e5abd305907c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 07:14:48 -0500 Subject: [PATCH 45/47] na_value_for_dtype PeriodDtype --- pandas/core/algorithms.py | 4 +-- pandas/core/dtypes/missing.py | 3 ++- pandas/tests/dtypes/test_missing.py | 41 ++++++++++++++++------------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fcc07dfa3f3a8..45f86f044a4b2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -29,7 +29,7 @@ _ensure_float64, _ensure_uint64, _ensure_int64) from pandas.compat.numpy import _np_version_under1p10 -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com from pandas._libs import algos, lib, hashtable as htable @@ -517,7 +517,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if (is_datetime64_any_dtype(original) or is_timedelta64_dtype(original) or is_period_dtype(original)): - na_value = iNaT + na_value = na_value_for_dtype(original.dtype) else: na_value = None diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 01c88c269e7e0..7be00cbfd567a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,6 +11,7 @@ is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, + is_period_dtype, is_complex_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, @@ -393,7 +394,7 @@ def na_value_for_dtype(dtype, compat=True): dtype = pandas_dtype(dtype) if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or - is_timedelta64_dtype(dtype)): + is_timedelta64_dtype(dtype) or is_period_dtype(dtype)): return NaT elif is_float_dtype(dtype): return np.nan diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 4f208bc352c70..365d8d762d673 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -15,7 +15,8 @@ from pandas import (NaT, Float64Index, Series, DatetimeIndex, TimedeltaIndex, date_range) from pandas.core.dtypes.common import is_scalar -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, PeriodDtype, IntervalDtype) from pandas.core.dtypes.missing import ( array_equivalent, isna, notna, isnull, notnull, na_value_for_dtype) @@ -311,23 +312,27 @@ def test_array_equivalent_str(): np.array(['A', 'X'], dtype=dtype)) -def test_na_value_for_dtype(): - for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), - DatetimeTZDtype('datetime64[ns, US/Eastern]')]: - assert na_value_for_dtype(dtype) is NaT - - for dtype in ['u1', 'u2', 'u4', 'u8', - 'i1', 'i2', 'i4', 'i8']: - assert na_value_for_dtype(np.dtype(dtype)) == 0 - - for dtype in ['bool']: - assert na_value_for_dtype(np.dtype(dtype)) is False - - for dtype in ['f2', 'f4', 'f8']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - for dtype in ['O']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) +@pytest.mark.parametrize('dtype, na_value', [ + # Datetime-like + (np.dtype("M8[ns]"), NaT), + (np.dtype("m8[ns]"), NaT), + (DatetimeTZDtype('datetime64[ns, US/Eastern]'), NaT), + (PeriodDtype("M"), NaT), + # Integer + ('u1', 0), ('u2', 0), ('u4', 0), ('u8', 0), + ('i1', 0), ('i2', 0), ('i4', 0), ('i8', 0), + # Bool + ('bool', False), + # Float + ('f2', np.nan), ('f4', np.nan), ('f8', np.nan), + # Object + ('O', np.nan), + # Interval + (IntervalDtype(), np.nan), +]) +def test_na_value_for_dtype(dtype, na_value): + result = na_value_for_dtype(dtype) + assert result is na_value class TestNAObj(object): From 69c3ea2c5776de2481f34f244cfb29c30065bfc0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Mar 2018 09:23:34 -0500 Subject: [PATCH 46/47] Use na_value --- pandas/core/arrays/base.py | 47 ++++++++++++++-------------- pandas/core/arrays/categorical.py | 2 +- pandas/tests/extension/json/array.py | 2 +- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 572a7dffdf965..e999a326b21c9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -256,7 +256,7 @@ def isna(self): raise AbstractMethodError(self) def _values_for_argsort(self): - # type: () -> ndarray + # type: () -> Tuple[ndarray, object] """Return values for sorting. Returns @@ -372,6 +372,7 @@ def unique(self): return self._constructor_from_sequence(uniques) def _values_for_factorize(self): + # type: () -> Tuple[ndarray, Any] """Return an array and missing value suitable for factorization. Returns @@ -380,15 +381,15 @@ def _values_for_factorize(self): An array suitable for factoraization. This should maintain order and be a supported dtype. By default, the extension array is cast to object dtype. - - Notes - ----- - The value returned by `_values_for_factorized` may be modified - inplace. Make sure it isn't a view on the original data. + na_value : object + The value in `values` to consider missing. This will be treated + as NA in the factorization routines, so it will be coded as + `na_sentinal` and not included in `uniques`. """ - return self.astype(object) + return self.astype(object), np.nan def factorize(self, na_sentinel=-1): + # type: (int) -> Tuple[ndarray, ExtensionArray] """Encode the extension array as an enumerated type. Parameters @@ -406,12 +407,13 @@ def factorize(self, na_sentinel=-1): .. note:: - uniques should *not* contain a value for the NA sentinel, - if values in `self` are missing. + uniques will *not* contain an entry for the NA value of + the ExtensionArray if there are any missing values present + in `self`. See Also -------- - pandas.factorize : top-level factorize method that dispatches here. + pandas.factorize : Top-level factorize method that dispatches here. Notes ----- @@ -419,19 +421,18 @@ def factorize(self, na_sentinel=-1): """ # Impelmentor note: There are two ways to override the behavior of # pandas.factorize - # 1. ExtensionArray._values_for_factories and - # ExtensionArray._from_factorize - # 2. ExtensionArray.factorize - # For the first, you get control over which values are passed to - # pandas' internal factorization method. - from pandas.core.algorithms import factorize - import pandas.core.dtypes.common as com - from pandas._libs.tslib import iNaT - - mask = self.isna() - arr = self._values_for_factorize() - - labels, uniques = factorize(arr, na_sentinel=na_sentinel, mask=mask) + # 1. _values_for_factorize and _from_factorize. + # Specify the values passed to pandas' internal factorization + # routines, and how to convert from those values back to the + # original ExtensionArray. + # 2. ExtensionArray.factorize. + # Complete control over factorization. + from pandas.core.algorithms import _factorize_array + + arr, na_value = self._values_for_factorize() + + labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel, + na_value=na_value) uniques = self._from_factorized(uniques, self) return labels, uniques diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index aaf4d9d5b6198..b5a4785fd98a6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2120,7 +2120,7 @@ def unique(self): def _values_for_factorize(self): codes = self.codes.astype('int64') - return codes + return codes, -1 @classmethod def _from_factorized(cls, uniques, original): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 32da0acf6fb48..51a68a3701046 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -114,7 +114,7 @@ def _concat_same_type(cls, to_concat): def _values_for_factorize(self): frozen = tuple(tuple(x.items()) for x in self) - return np.array(frozen, dtype=object) + return np.array(frozen, dtype=object), () def _values_for_argsort(self): # Disable NumPy's shape inference by including an empty tuple... From c06da3a7cc52240159cda296649627d49b5f4bb4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Mar 2018 06:17:49 -0500 Subject: [PATCH 47/47] Typing Spacing Docs --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e2356137fd4a..065a5782aced1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -601,8 +601,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # should happen here. values = _ensure_arraylike(values) - original = values + if is_extension_array_dtype(values): values = getattr(values, '_values', values) labels, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e999a326b21c9..c281bd80cb274 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -256,7 +256,7 @@ def isna(self): raise AbstractMethodError(self) def _values_for_argsort(self): - # type: () -> Tuple[ndarray, object] + # type: () -> ndarray """Return values for sorting. Returns @@ -379,12 +379,13 @@ def _values_for_factorize(self): ------- values : ndarray An array suitable for factoraization. This should maintain order - and be a supported dtype. By default, the extension array is cast - to object dtype. + and be a supported dtype (Float64, Int64, UInt64, String, Object). + By default, the extension array is cast to object dtype. na_value : object The value in `values` to consider missing. This will be treated as NA in the factorization routines, so it will be coded as - `na_sentinal` and not included in `uniques`. + `na_sentinal` and not included in `uniques`. By default, + ``np.nan`` is used. """ return self.astype(object), np.nan