pandas-dev · jreback · Mar 27, 2018 · Feb 19, 2018 · Mar 2, 2018 · Mar 2, 2018
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -4,6 +4,8 @@
 """
 from __future__ import division
 from warnings import warn, catch_warnings
+from textwrap import dedent
+
 import numpy as np
 
 from pandas.core.dtypes.cast import (
@@ -34,7 +36,10 @@
 from pandas.core import common as com
 from pandas._libs import algos, lib, hashtable as htable
 from pandas._libs.tslib import iNaT
-from pandas.util._decorators import deprecate_kwarg
+from pandas.util._decorators import (Appender, Substitution,
+                                     deprecate_kwarg)
+
+_shared_docs = {}
 
 
 # --------------- #
@@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original):
     Returns
     -------
     Index for extension types, otherwise ndarray casted to dtype
-
     """
     from pandas import Index
-    if is_categorical_dtype(dtype):
+    if is_extension_array_dtype(dtype):
         pass
     elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
         values = Index(original)._shallow_copy(values, name=None)
@@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     return labels, uniques
 
 
-@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
-def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
-    """
-    Encode input values as an enumerated type or categorical variable
+_shared_docs['factorize'] = """
+    Encode the object as an enumerated type or categorical variable.
+
+    This method is useful for obtaining a numeric representation of an
+    array when all that matters is identifying distinct values. `factorize`
+    is available as both a top-level function :func:`pandas.factorize`,
+    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
 
     Parameters
     ----------
-    values : Sequence
-        ndarrays must be 1-D. Sequences that aren't pandas objects are
-        coereced to ndarrays before factorization.
-    sort : boolean, default False
-        Sort by values
+    %(values)s%(sort)s%(order)s
     na_sentinel : int, default -1
-        Value to mark "not found"
-    size_hint : hint to the hashtable sizer
+        Value to mark "not found".
+    %(size_hint)s\
 
     Returns
     -------
-    labels : the indexer to the original array
-    uniques : ndarray (1-d) or Index
-        the unique values. Index is returned when passed values is Index or
-        Series
+    labels : ndarray
+        An integer ndarray that's an indexer into `uniques`.
+        ``uniques.take(labels)`` will have the same values as `values`.
+    uniques : ndarray, Index, or Categorical
+        The unique valid values. When `values` is Categorical, `uniques`
+        is a Categorical. When `values` is some other pandas object, an
+        `Index` is returned. Otherwise, a 1-D ndarray is returned.
+
+        .. note ::
+
+           Even if there's a missing value in `values`, `uniques` will
+           *not* contain an entry for it.
+
+    See Also
+    --------
+    pandas.cut : Discretize continuous-valued array.
+    pandas.unique : Find the unique valuse in an array.
+
+    Examples
+    --------
+    These examples all show factorize as a top-level method like
+    ``pd.factorize(values)``. The results are identical for methods like
+    :meth:`Series.factorize`.
+
+    >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
+    >>> labels
+    array([0, 0, 1, 2, 0])
+    >>> uniques
+    array(['b', 'a', 'c'], dtype=object)
+
+    With ``sort=True``, the `uniques` will be sorted, and `labels` will be
+    shuffled so that the relationship is the maintained.
+
+    >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
+    >>> labels
+    array([1, 1, 0, 2, 1])
+    >>> uniques
+    array(['a', 'b', 'c'], dtype=object)
+
+    Missing values are indicated in `labels` with `na_sentinel`
+    (``-1`` by default). Note that missing values are never
+    included in `uniques`.
+
+    >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
+    >>> labels
+    array([ 0, -1,  1,  2,  0])
+    >>> uniques
+    array(['b', 'a', 'c'], dtype=object)
 
-    note: an array of Periods will ignore sort as it returns an always sorted
-    PeriodIndex.
+    Thus far, we've only factorized lists (which are internally coerced to
+    NumPy arrays). When factorizing pandas objects, the type of `uniques`
+    will differ. For Categoricals, a `Categorical` is returned.
+
+    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
+    >>> labels, uniques = pd.factorize(cat)
+    >>> labels
+    array([0, 0, 1])
+    >>> uniques
+    [a, c]
+    Categories (3, object): [a, b, c]
+
+    Notice that ``'b'`` is in ``uniques.categories``, desipite not being
+    present in ``cat.values``.
+
+    For all other pandas objects, an Index of the appropriate type is
+    returned.
+
+    >>> cat = pd.Series(['a', 'a', 'c'])
+    >>> labels, uniques = pd.factorize(cat)
+    >>> labels
+    array([0, 0, 1])
+    >>> uniques
+    Index(['a', 'c'], dtype='object')
     """
+
+
+@Substitution(
+    values=dedent("""\
+    values : sequence
+        A 1-D seqeunce. Sequences that aren't pandas objects are
+        coereced to ndarrays before factorization.
+    """),
+    order=dedent("""\
+    order
+        .. deprecated:: 0.23.0
+
+           This parameter has no effect and is deprecated.
+    """),
+    sort=dedent("""\
+    sort : bool, default False
+        Sort `uniques` and shuffle `labels` to maintain the
+        relationship.
+    """),
+    size_hint=dedent("""\
+    size_hint : int, optional
+        Hint to the hashtable sizer.
+    """),
+)
+@Appender(_shared_docs['factorize'])
+@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
+def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     # Implementation notes: This method is responsible for 3 things
     # 1.) coercing data to array-like (ndarray, Index, extension array)
     # 2.) factorizing labels and uniques
@@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     values = _ensure_arraylike(values)
     original = values
 
-    if is_categorical_dtype(values):
+    if is_extension_array_dtype(values):
         values = getattr(values, '_values', values)
-        labels, uniques = values.factorize()
+        labels, uniques = values.factorize(na_sentinel=na_sentinel)
         dtype = original.dtype
     else:
         values, dtype, _ = _ensure_data(values)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -77,6 +77,24 @@ def _constructor_from_sequence(cls, scalars):
         """
         raise AbstractMethodError(cls)
 
+    @classmethod
+    def _from_factorized(cls, values, original):
+        """Reconstruct an ExtensionArray after factorization.
+
+        Parameters
+        ----------
+        values : ndarray
+            An integer ndarray with the factorized values.
+        original : ExtensionArray
+            The original ExtensionArray that factorize was called on.
+
+        See Also
+        --------
+        pandas.factorize
+        ExtensionArray.factorize
+        """
+        raise AbstractMethodError(cls)
+
     # ------------------------------------------------------------------------
     # Must be a Sequence
     # ------------------------------------------------------------------------
@@ -353,6 +371,73 @@ def unique(self):
         uniques = unique(self.astype(object))
         return self._constructor_from_sequence(uniques)
 
+    def _values_for_factorize(self):
+        # type: () -> Tuple[ndarray, Any]
+        """Return an array and missing value suitable for factorization.
+
+        Returns
+        -------
+        values : ndarray
+            An array suitable for factoraization. This should maintain order
+            and be a supported dtype (Float64, Int64, UInt64, String, Object).
+            By default, the extension array is cast to object dtype.
+        na_value : object
+            The value in `values` to consider missing. This will be treated
+            as NA in the factorization routines, so it will be coded as
+            `na_sentinal` and not included in `uniques`. By default,
+            ``np.nan`` is used.
+        """
+        return self.astype(object), np.nan
+
+    def factorize(self, na_sentinel=-1):
+        # type: (int) -> Tuple[ndarray, ExtensionArray]
+        """Encode the extension array as an enumerated type.
+
+        Parameters
+        ----------
+        na_sentinel : int, default -1
+            Value to use in the `labels` array to indicate missing values.
+
+        Returns
+        -------
+        labels : ndarray
+            An interger NumPy array that's an indexer into the original
+            ExtensionArray.
+        uniques : ExtensionArray
+            An ExtensionArray containing the unique values of `self`.
+
+            .. note::
+
+               uniques will *not* contain an entry for the NA value of
+               the ExtensionArray if there are any missing values present
+               in `self`.
+
+        See Also
+        --------
+        pandas.factorize : Top-level factorize method that dispatches here.
+
+        Notes
+        -----
+        :meth:`pandas.factorize` offers a `sort` keyword as well.
+        """
+        # Impelmentor note: There are two ways to override the behavior of
+        # pandas.factorize
+        # 1. _values_for_factorize and _from_factorize.
+        #    Specify the values passed to pandas' internal factorization
+        #    routines, and how to convert from those values back to the
+        #    original ExtensionArray.
+        # 2. ExtensionArray.factorize.
+        #    Complete control over factorization.
+        from pandas.core.algorithms import _factorize_array
+
+        arr, na_value = self._values_for_factorize()
+
+        labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel,
+                                           na_value=na_value)
+
+        uniques = self._from_factorized(uniques, self)
+        return labels, uniques
+
     # ------------------------------------------------------------------------
     # Indexing methods
     # ------------------------------------------------------------------------

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2118,58 +2118,15 @@ def unique(self):
             take_codes = sorted(take_codes)
         return cat.set_categories(cat.categories.take(take_codes))
 
-    def factorize(self, na_sentinel=-1):
-        """Encode the Categorical as an enumerated type.
-
-        Parameters
-        ----------
-        sort : boolean, default False
-            Sort by values
-        na_sentinel: int, default -1
-            Value to mark "not found"
-
-        Returns
-        -------
-        labels : ndarray
-            An integer NumPy array that's an indexer into the original
-            Categorical
-        uniques : Categorical
-            A Categorical whose values are the unique values and
-            whose dtype matches the original CategoricalDtype. Note that if
-            there any unobserved categories in ``self`` will not be present
-            in ``uniques.values``. They will be present in
-            ``uniques.categories``
-
-        Examples
-        --------
-        >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
-        >>> labels, uniques = cat.factorize()
-        >>> labels
-        (array([0, 0, 1]),
-        >>> uniques
-        [a, c]
-        Categories (3, object): [a, b, c])
-
-        Missing values are handled
-
-        >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None]))
-        >>> labels
-        array([ 0,  1, -1])
-        >>> uniques
-        [a, b]
-        Categories (2, object): [a, b]
-        """
-        from pandas.core.algorithms import _factorize_array
-
+    def _values_for_factorize(self):
         codes = self.codes.astype('int64')
-        # We set missing codes, normally -1, to iNaT so that the
-        # Int64HashTable treats them as missing values.
-        labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel,
-                                           na_value=-1)
-        uniques = self._constructor(self.categories.take(uniques),
-                                    categories=self.categories,
-                                    ordered=self.ordered)
-        return labels, uniques
+        return codes, -1
+
+    @classmethod
+    def _from_factorized(cls, uniques, original):
+        return original._constructor(original.categories.take(uniques),
+                                     categories=original.categories,
+                                     ordered=original.ordered)
 
     def equals(self, other):
         """

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -2,6 +2,7 @@
 Base and utility classes for pandas objects.
 """
 import warnings
+import textwrap
 from pandas import compat
 from pandas.compat import builtins
 import numpy as np
@@ -1151,24 +1152,16 @@ def memory_usage(self, deep=False):
             v += lib.memory_usage_of_objects(self.values)
         return v
 
+    @Substitution(
+        values='', order='', size_hint='',
+        sort=textwrap.dedent("""\
+            sort : boolean, default False
+                Sort `uniques` and shuffle `labels` to maintain the
+                relationship.
+            """))
+    @Appender(algorithms._shared_docs['factorize'])
     def factorize(self, sort=False, na_sentinel=-1):
-        """
-        Encode the object as an enumerated type or categorical variable
-
-        Parameters
-        ----------
-        sort : boolean, default False
-            Sort by values
-        na_sentinel: int, default -1
-            Value to mark "not found"
-
-        Returns
-        -------
-        labels : the indexer to the original array
-        uniques : the unique Index
-        """
-        from pandas.core.algorithms import factorize
-        return factorize(self, sort=sort, na_sentinel=na_sentinel)
+        return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
 
     _shared_docs['searchsorted'] = (
         """Find indices where elements should be inserted to maintain order.