pandas-dev · jreback · Mar 27, 2018 · Feb 19, 2018 · Mar 2, 2018 · Mar 2, 2018
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -146,10 +146,9 @@ def _reconstruct_data(values, dtype, original):
     Returns
     -------
     Index for extension types, otherwise ndarray casted to dtype
-
     """
     from pandas import Index
-    if is_categorical_dtype(dtype):
+    if is_extension_array_dtype(dtype):
         pass
     elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
         values = Index(original)._shallow_copy(values, name=None)
@@ -502,9 +501,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     values = _ensure_arraylike(values)
     original = values
 
-    if is_categorical_dtype(values):
+    if is_extension_array_dtype(values):
         values = getattr(values, '_values', values)
-        labels, uniques = values.factorize()
+        labels, uniques = values.factorize(na_sentinel=na_sentinel)
         dtype = original.dtype
     else:
         values, dtype, _ = _ensure_data(values)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -248,6 +248,41 @@ def unique(self):
         uniques = unique(self.astype(object))
         return self._constructor_from_sequence(uniques)
 
+    def factorize(self, na_sentinel=-1):
+        """Encode the extension array as an enumerated type.
+
+        Parameters
+        ----------
+        na_sentinel : int, default -1
+            Value to use in the `labels` array to indicate missing values.
+
+        Returns
+        -------
+        labels : ndarray
+            An integer NumPy array that's an indexer into the original
+            ExtensionArray.
+        uniques : ExtensionArray
+            An ExtensionArray containing the unique values of `self`.
+
+        See Also
+        --------
+        pandas.factorize : Top-level factorize method that dispatches here.
+
+        Notes
+        -----
+        :func:`pandas.factorize` offers a `sort` keyword as well.
+        """
+        from pandas.core.algorithms import _factorize_array
+
+        mask = self.isna()
+        arr = self.astype(object)
+        arr[mask] = np.nan
+
+        labels, uniques = _factorize_array(arr, check_nulls=True,
+                                           na_sentinel=na_sentinel)
+        uniques = self._constructor_from_sequence(uniques)
+        return labels, uniques
+
     # ------------------------------------------------------------------------
     # Indexing methods
     # ------------------------------------------------------------------------

diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py
@@ -4,3 +4,6 @@
 class BaseExtensionTests(object):
     assert_series_equal = staticmethod(tm.assert_series_equal)
     assert_frame_equal = staticmethod(tm.assert_frame_equal)
+    assert_extension_array_equal = staticmethod(
+        tm.assert_extension_array_equal
+    )
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -2,6 +2,7 @@
 import numpy as np
 
 import pandas as pd
+import pandas.util.testing as tm
 
 from .base import BaseExtensionTests
 
@@ -42,3 +43,25 @@ def test_unique(self, data, box, method):
         assert len(result) == 1
         assert isinstance(result, type(data))
         assert result[0] == duplicated[0]
+
+    @pytest.mark.parametrize('na_sentinel', [-1, -2])
+    def test_factorize(self, data_for_grouping, na_sentinel):
+        # Input is [B, B, NA, NA, A, A, B, C]; codes follow first appearance.
+        labels, uniques = pd.factorize(data_for_grouping,
+                                       na_sentinel=na_sentinel)
+        expected_labels = np.array([0, 0, na_sentinel,
+                                   na_sentinel, 1, 1, 0, 2],
+                                   dtype='int64')
+        expected_uniques = data_for_grouping.take([0, 4, 7])
+
+        tm.assert_numpy_array_equal(labels, expected_labels)
+        self.assert_extension_array_equal(uniques, expected_uniques)
+
+    @pytest.mark.parametrize('na_sentinel', [-1, -2])
+    def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
+        # The top-level function must agree with the array's own method.
+        l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
+        l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
+
+        tm.assert_numpy_array_equal(l1, l2)
+        self.assert_extension_array_equal(u1, u2)
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
@@ -34,6 +34,11 @@ def na_value():
     return np.nan
 
 
+@pytest.fixture
+def data_for_grouping():
+    return Categorical(['a', 'a', None, None, 'b', 'b', 'a', 'c'])
+
+
 class TestDtype(base.BaseDtypeTests):
     pass
 

diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
@@ -46,3 +46,14 @@ def na_cmp():
 def na_value():
     """The scalar missing value for this type. Default 'None'"""
     return None
+
+
+@pytest.fixture
+def data_for_grouping():
+    """Data for factorization, grouping, and unique tests.
+
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+
+    Where A < B < C and NA is missing
+    """
+    raise NotImplementedError
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -35,6 +35,15 @@ def na_value():
     return decimal.Decimal("NaN")
 
 
+@pytest.fixture
+def data_for_grouping():
+    b = decimal.Decimal('1.0')
+    a = decimal.Decimal('0.0')
+    c = decimal.Decimal('2.0')
+    na = decimal.Decimal('NaN')
+    return DecimalArray([b, b, na, na, a, a, b, c])
+
+
 class TestDtype(base.BaseDtypeTests):
     pass
 

diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 
+import pandas as pd
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.arrays import ExtensionArray
 
@@ -104,6 +105,41 @@ def _concat_same_type(cls, to_concat):
         data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
         return cls(data)
 
+    def factorize(self, na_sentinel=-1):
+        """Encode the array as integer labels plus its unique values.
+
+        Parameters
+        ----------
+        na_sentinel : int, default -1
+            Label written at every position where ``self.isna()`` is True.
+
+        Returns
+        -------
+        labels : ndarray
+            One integer code per element of ``self``.
+        uniques : JSONArray
+            The distinct values, with the empty mapping left out.
+        """
+        # Factorize tuples of the items rather than the mappings themselves.
+        frozen = tuple(tuple(x.items()) for x in self)
+        labels, uniques = pd.factorize(frozen)
+
+        # fixup NA
+        mask = self.isna()
+        if mask.any():
+            # The code assigned to NA is the label stored at the first
+            # missing position, not that position's index.
+            na_code = labels[mask.argmax()]
+            # Computed before any write so a sentinel is never shifted.
+            after_na = labels > na_code
+
+            labels[labels == na_code] = na_sentinel
+            labels[after_na] -= 1
+
+        uniques = JSONArray([collections.UserDict(x)
+                             for x in uniques if x != ()])
+        return labels, uniques
+
 
 def make_data():
     # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer

diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -39,6 +39,17 @@ def na_cmp():
     return operator.eq
 
 
+@pytest.fixture
+def data_for_grouping():
+    return JSONArray([
+        {'b': 1}, {'b': 1},
+        {}, {},
+        {'a': 0, 'c': 2}, {'a': 0, 'c': 2},
+        {'b': 1},
+        {'c': 2},
+    ])
+
+
 class TestDtype(base.BaseDtypeTests):
     pass
 
@@ -64,8 +75,15 @@ class TestMissing(base.BaseMissingTests):
 
 
 class TestMethods(base.BaseMethodsTests):
-    @pytest.mark.skip(reason="Unhashable")
-    def test_value_counts(self, all_data, dropna):
+    # Shared skip marker for base tests that need hashable values.
+    unhashable = pytest.mark.skip(reason="Unhashable")
+
+    @unhashable
+    def test_value_counts(self, all_data, dropna):
+        pass
+
+    @unhashable
+    def test_factorize(self):
         pass
 
 

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 import pandas as pd
+from pandas.core.arrays.base import ExtensionArray
 from pandas.core.dtypes.missing import array_equivalent
 from pandas.core.dtypes.common import (
     is_datetimelike_v_numeric,
@@ -1083,6 +1084,39 @@ def _raise(left, right, err_msg):
     return True
 
 
+def assert_extension_array_equal(left, right):
+    """Check that left and right ExtensionArrays are equal.
+
+    Parameters
+    ----------
+    left, right : ExtensionArray
+        The two arrays to compare
+
+    Raises
+    ------
+    AssertionError
+        If either argument is not an ExtensionArray or the dtypes differ.
+        Mask and value mismatches come from ``assert_numpy_array_equal``.
+
+    Notes
+    -----
+    Missing values are checked separately from valid values.
+    A mask of missing values is computed for each and checked to match.
+    The remaining all-valid values are cast to object dtype and checked.
+    """
+    assert isinstance(left, ExtensionArray), 'left is not an ExtensionArray'
+    assert isinstance(right, ExtensionArray), 'right is not an ExtensionArray'
+    assert left.dtype == right.dtype
+    left_na = left.isna()
+    right_na = right.isna()
+    assert_numpy_array_equal(left_na, right_na)
+
+    left_valid = left[~left_na].astype(object)
+    right_valid = right[~right_na].astype(object)
+
+    assert_numpy_array_equal(left_valid, right_valid)
+
+
 # This could be refactored to use the NDFrame.equals method
 def assert_series_equal(left, right, check_dtype=True,
                         check_index_type='equiv',