REF/BUG/API: factorizing categorical data (#19938)

TomAugspurger · jreback · commit 38afa9310040 · 2018-03-15T06:40:58.000-04:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -843,6 +843,8 @@ Categorical
 - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
 - Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`)
 - Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19565`)
+- Bug in :func:`pandas.factorize` on categorical data returning the unique codes instead of the unique values for ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
+- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -435,15 +435,45 @@ def isin(comps, values):
     return f(comps, values)
 
 
+def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
+    """Factorize an array-like to labels and uniques.
+
+    This doesn't do any coercion of types or unboxing before factorization.
+
+    Parameters
+    ----------
+    values : ndarray
+    check_nulls : bool
+        Whether to check for nulls in the hashtable's 'get_labels' method.
+    na_sentinel : int, default -1
+    size_hint : int, optional
+        Initial size of the hashtable; falls back to ``len(values)``.
+
+    Returns
+    -------
+    labels, uniques : ndarray
+    """
+    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
+
+    table = hash_klass(size_hint or len(values))
+    uniques = vec_klass()
+    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
+
+    labels = _ensure_platform_int(labels)
+    uniques = uniques.to_array()
+    return labels, uniques
+
+
 @deprecate_kwarg(old_arg_name='order', new_arg_name=None)
 def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     """
     Encode input values as an enumerated type or categorical variable
 
     Parameters
     ----------
-    values : ndarray (1-d)
-        Sequence
+    values : Sequence
+        ndarrays must be 1-D. Sequences that aren't pandas objects are
+        coerced to ndarrays before factorization.
     sort : boolean, default False
         Sort by values
     na_sentinel : int, default -1
@@ -458,26 +488,43 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         Series
 
     note: an array of Periods will ignore sort as it returns an always sorted
-    PeriodIndex
+    PeriodIndex.
     """
+    # Implementation notes: This method is responsible for 3 things
+    # 1.) coercing data to array-like (ndarray, Index, extension array)
+    # 2.) factorizing labels and uniques
+    # 3.) Maybe boxing the output in an Index
+    #
+    # Step 2 is dispatched to extension types (like Categorical). They are
+    # responsible only for factorization. All data coercion, sorting and boxing
+    # should happen here.
 
     values = _ensure_arraylike(values)
     original = values
-    values, dtype, _ = _ensure_data(values)
-    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
-
-    table = hash_klass(size_hint or len(values))
-    uniques = vec_klass()
-    check_nulls = not is_integer_dtype(original)
-    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
 
-    labels = _ensure_platform_int(labels)
-    uniques = uniques.to_array()
+    if is_categorical_dtype(values):
+        values = getattr(values, '_values', values)
+        labels, uniques = values.factorize()
+        dtype = original.dtype
+    else:
+        values, dtype, _ = _ensure_data(values)
+        check_nulls = not is_integer_dtype(original)
+        labels, uniques = _factorize_array(values, check_nulls,
+                                           na_sentinel=na_sentinel,
+                                           size_hint=size_hint)
 
     if sort and len(uniques) > 0:
         from pandas.core.sorting import safe_sort
-        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
-                                    assume_unique=True)
+        try:
+            order = uniques.argsort()
+            order2 = order.argsort()
+            labels = take_1d(order2, labels, fill_value=na_sentinel)
+            uniques = uniques.take(order)
+        except TypeError:
+            # Mixed types, where uniques.argsort fails.
+            uniques, labels = safe_sort(uniques, labels,
+                                        na_sentinel=na_sentinel,
+                                        assume_unique=True)
 
     uniques = _reconstruct_data(uniques, dtype, original)
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -7,6 +7,7 @@
 from pandas import compat
 from pandas.compat import u, lzip
 from pandas._libs import lib, algos as libalgos
+from pandas._libs.tslib import iNaT
 
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCIndexClass, ABCCategoricalIndex)
@@ -364,10 +365,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
         self._dtype = self._dtype.update_dtype(dtype)
         self._codes = coerce_indexer_dtype(codes, dtype.categories)
 
-    @classmethod
-    def _constructor_from_sequence(cls, scalars):
-        return cls(scalars)
-
     @property
     def categories(self):
         """The categories of this categorical.
@@ -425,6 +422,10 @@ def _ndarray_values(self):
     def _constructor(self):
         return Categorical
 
+    @classmethod
+    def _constructor_from_sequence(cls, scalars):
+        return Categorical(scalars)
+
     def copy(self):
         """ Copy constructor. """
         return self._constructor(values=self._codes.copy(),
@@ -2072,6 +2073,60 @@ def unique(self):
             take_codes = sorted(take_codes)
         return cat.set_categories(cat.categories.take(take_codes))
 
+    def factorize(self, na_sentinel=-1):
+        """Encode the Categorical as an enumerated type.
+
+        Parameters
+        ----------
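+        na_sentinel : int, default -1
+            Value used in ``labels`` to mark missing values. Missing values
+            are not included in ``uniques``. Sorting is not done here; use
+            :func:`pandas.factorize` with ``sort=True`` to sort the uniques.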
+
+        Returns
+        -------
+        labels : ndarray
+            An integer NumPy array that's an indexer into the original
+            Categorical
+        uniques : Categorical
+            A Categorical whose values are the unique values and
+            whose dtype matches the original CategoricalDtype. Note that
+            any unobserved categories in ``self`` will not be present
+            in ``uniques.values``, but they will be present in
+            ``uniques.categories``.
+
+        Examples
+        --------
+        >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
+        >>> labels, uniques = cat.factorize()
+        >>> labels
+        array([0, 0, 1])
+        >>> uniques
+        [a, c]
+        Categories (3, object): [a, b, c]
+
+        Missing values are handled
+
+        >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None]))
+        >>> labels
+        array([ 0,  1, -1])
+        >>> uniques
+        [a, b]
+        Categories (2, object): [a, b]
+        """
+        from pandas.core.algorithms import _factorize_array
+
+        codes = self.codes.astype('int64')
+        codes[codes == -1] = iNaT
+        # We set missing codes, normally -1, to iNaT so that the
+        # Int64HashTable treats them as missing values.
+        labels, uniques = _factorize_array(codes, check_nulls=True,
+                                           na_sentinel=na_sentinel)
+        uniques = self._constructor(self.categories.take(uniques),
+                                    categories=self.categories,
+                                    ordered=self.ordered)
+        return labels, uniques
+
     def equals(self, other):
         """
         Returns True if categorical arrays are equal.
diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py
@@ -0,0 +1,49 @@
+import pytest
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+@pytest.mark.parametrize('ordered', [True, False])
+@pytest.mark.parametrize('categories', [
+    ['b', 'a', 'c'],
+    ['a', 'b', 'c', 'd'],
+])
+def test_factorize(categories, ordered):
+    cat = pd.Categorical(['b', 'b', 'a', 'c', None],
+                         categories=categories,
+                         ordered=ordered)
+    labels, uniques = pd.factorize(cat)
+    expected_labels = np.array([0, 0, 1, 2, -1], dtype='int64')
+    expected_uniques = pd.Categorical(['b', 'a', 'c'],
+                                      categories=categories,
+                                      ordered=ordered)
+
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_factorized_sort():
+    cat = pd.Categorical(['b', 'b', None, 'a'])
+    labels, uniques = pd.factorize(cat, sort=True)
+    expected_labels = np.array([1, 1, -1, 0], dtype='int64')
+    expected_uniques = pd.Categorical(['a', 'b'])
+
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_factorized_sort_ordered():
+    cat = pd.Categorical(['b', 'b', None, 'a'],
+                         categories=['c', 'b', 'a'],
+                         ordered=True)
+
+    labels, uniques = pd.factorize(cat, sort=True)
+    expected_labels = np.array([0, 0, -1, 1], dtype='int64')
+    expected_uniques = pd.Categorical(['b', 'a'],
+                                      categories=['c', 'b', 'a'],
+                                      ordered=True)
+
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_categorical_equal(uniques, expected_uniques)