Skip to content

Commit b0143e5

Browse files
TomAugspurgerjavadnoorb
authored andcommitted
ENH/API: ExtensionArray.factorize (pandas-dev#20361)
1 parent 15b7138 commit b0143e5

File tree

11 files changed

+290
-90
lines changed

11 files changed

+290
-90
lines changed

pandas/core/algorithms.py

+118-22
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
"""
55
from __future__ import division
66
from warnings import warn, catch_warnings
7+
from textwrap import dedent
8+
79
import numpy as np
810

911
from pandas.core.dtypes.cast import (
@@ -34,7 +36,10 @@
3436
from pandas.core import common as com
3537
from pandas._libs import algos, lib, hashtable as htable
3638
from pandas._libs.tslib import iNaT
37-
from pandas.util._decorators import deprecate_kwarg
39+
from pandas.util._decorators import (Appender, Substitution,
40+
deprecate_kwarg)
41+
42+
_shared_docs = {}
3843

3944

4045
# --------------- #
@@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original):
146151
Returns
147152
-------
148153
Index for extension types, otherwise ndarray cast to dtype
149-
150154
"""
151155
from pandas import Index
152-
if is_categorical_dtype(dtype):
156+
if is_extension_array_dtype(dtype):
153157
pass
154158
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
155159
values = Index(original)._shallow_copy(values, name=None)
@@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
469473
return labels, uniques
470474

471475

472-
@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
473-
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
474-
"""
475-
Encode input values as an enumerated type or categorical variable
476+
_shared_docs['factorize'] = """
477+
Encode the object as an enumerated type or categorical variable.
478+
479+
This method is useful for obtaining a numeric representation of an
480+
array when all that matters is identifying distinct values. `factorize`
481+
is available as both a top-level function :func:`pandas.factorize`,
482+
and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
476483
477484
Parameters
478485
----------
479-
values : Sequence
480-
ndarrays must be 1-D. Sequences that aren't pandas objects are
481-
coereced to ndarrays before factorization.
482-
sort : boolean, default False
483-
Sort by values
486+
%(values)s%(sort)s%(order)s
484487
na_sentinel : int, default -1
485-
Value to mark "not found"
486-
size_hint : hint to the hashtable sizer
488+
Value to mark "not found".
489+
%(size_hint)s\
487490
488491
Returns
489492
-------
490-
labels : the indexer to the original array
491-
uniques : ndarray (1-d) or Index
492-
the unique values. Index is returned when passed values is Index or
493-
Series
493+
labels : ndarray
494+
An integer ndarray that's an indexer into `uniques`.
495+
``uniques.take(labels)`` will have the same values as `values`.
496+
uniques : ndarray, Index, or Categorical
497+
The unique valid values. When `values` is Categorical, `uniques`
498+
is a Categorical. When `values` is some other pandas object, an
499+
`Index` is returned. Otherwise, a 1-D ndarray is returned.
500+
501+
.. note ::
502+
503+
Even if there's a missing value in `values`, `uniques` will
504+
*not* contain an entry for it.
505+
506+
See Also
507+
--------
508+
pandas.cut : Discretize continuous-valued array.
509+
pandas.unique : Find the unique values in an array.
510+
511+
Examples
512+
--------
513+
These examples all show factorize as a top-level method like
514+
``pd.factorize(values)``. The results are identical for methods like
515+
:meth:`Series.factorize`.
516+
517+
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
518+
>>> labels
519+
array([0, 0, 1, 2, 0])
520+
>>> uniques
521+
array(['b', 'a', 'c'], dtype=object)
522+
523+
With ``sort=True``, the `uniques` will be sorted, and `labels` will be
524+
shuffled so that the relationship is maintained.
525+
526+
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
527+
>>> labels
528+
array([1, 1, 0, 2, 1])
529+
>>> uniques
530+
array(['a', 'b', 'c'], dtype=object)
531+
532+
Missing values are indicated in `labels` with `na_sentinel`
533+
(``-1`` by default). Note that missing values are never
534+
included in `uniques`.
535+
536+
>>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
537+
>>> labels
538+
array([ 0, -1, 1, 2, 0])
539+
>>> uniques
540+
array(['b', 'a', 'c'], dtype=object)
494541
495-
note: an array of Periods will ignore sort as it returns an always sorted
496-
PeriodIndex.
542+
Thus far, we've only factorized lists (which are internally coerced to
543+
NumPy arrays). When factorizing pandas objects, the type of `uniques`
544+
will differ. For Categoricals, a `Categorical` is returned.
545+
546+
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
547+
>>> labels, uniques = pd.factorize(cat)
548+
>>> labels
549+
array([0, 0, 1])
550+
>>> uniques
551+
[a, c]
552+
Categories (3, object): [a, b, c]
553+
554+
Notice that ``'b'`` is in ``uniques.categories``, despite not being
555+
present in ``cat.values``.
556+
557+
For all other pandas objects, an Index of the appropriate type is
558+
returned.
559+
560+
>>> cat = pd.Series(['a', 'a', 'c'])
561+
>>> labels, uniques = pd.factorize(cat)
562+
>>> labels
563+
array([0, 0, 1])
564+
>>> uniques
565+
Index(['a', 'c'], dtype='object')
497566
"""
567+
568+
569+
@Substitution(
570+
values=dedent("""\
571+
values : sequence
572+
A 1-D sequence. Sequences that aren't pandas objects are
573+
coerced to ndarrays before factorization.
574+
"""),
575+
order=dedent("""\
576+
order
577+
.. deprecated:: 0.23.0
578+
579+
This parameter has no effect and is deprecated.
580+
"""),
581+
sort=dedent("""\
582+
sort : bool, default False
583+
Sort `uniques` and shuffle `labels` to maintain the
584+
relationship.
585+
"""),
586+
size_hint=dedent("""\
587+
size_hint : int, optional
588+
Hint to the hashtable sizer.
589+
"""),
590+
)
591+
@Appender(_shared_docs['factorize'])
592+
@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
593+
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
498594
# Implementation notes: This method is responsible for 3 things
499595
# 1.) coercing data to array-like (ndarray, Index, extension array)
500596
# 2.) factorizing labels and uniques
@@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
507603
values = _ensure_arraylike(values)
508604
original = values
509605

510-
if is_categorical_dtype(values):
606+
if is_extension_array_dtype(values):
511607
values = getattr(values, '_values', values)
512-
labels, uniques = values.factorize()
608+
labels, uniques = values.factorize(na_sentinel=na_sentinel)
513609
dtype = original.dtype
514610
else:
515611
values, dtype, _ = _ensure_data(values)

pandas/core/arrays/base.py

+85
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,24 @@ def _constructor_from_sequence(cls, scalars):
7777
"""
7878
raise AbstractMethodError(cls)
7979

80+
@classmethod
81+
def _from_factorized(cls, values, original):
82+
"""Reconstruct an ExtensionArray after factorization.
83+
84+
Parameters
85+
----------
86+
values : ndarray
87+
An integer ndarray with the factorized values.
88+
original : ExtensionArray
89+
The original ExtensionArray that factorize was called on.
90+
91+
See Also
92+
--------
93+
pandas.factorize
94+
ExtensionArray.factorize
95+
"""
96+
raise AbstractMethodError(cls)
97+
8098
# ------------------------------------------------------------------------
8199
# Must be a Sequence
82100
# ------------------------------------------------------------------------
@@ -353,6 +371,73 @@ def unique(self):
353371
uniques = unique(self.astype(object))
354372
return self._constructor_from_sequence(uniques)
355373

374+
def _values_for_factorize(self):
375+
# type: () -> Tuple[ndarray, Any]
376+
"""Return an array and missing value suitable for factorization.
377+
378+
Returns
379+
-------
380+
values : ndarray
381+
An array suitable for factorization. This should maintain order
382+
and be a supported dtype (Float64, Int64, UInt64, String, Object).
383+
By default, the extension array is cast to object dtype.
384+
na_value : object
385+
The value in `values` to consider missing. This will be treated
386+
as NA in the factorization routines, so it will be coded as
387+
`na_sentinel` and not included in `uniques`. By default,
388+
``np.nan`` is used.
389+
"""
390+
return self.astype(object), np.nan
391+
392+
def factorize(self, na_sentinel=-1):
393+
# type: (int) -> Tuple[ndarray, ExtensionArray]
394+
"""Encode the extension array as an enumerated type.
395+
396+
Parameters
397+
----------
398+
na_sentinel : int, default -1
399+
Value to use in the `labels` array to indicate missing values.
400+
401+
Returns
402+
-------
403+
labels : ndarray
404+
An integer NumPy array that's an indexer into the original
405+
ExtensionArray.
406+
uniques : ExtensionArray
407+
An ExtensionArray containing the unique values of `self`.
408+
409+
.. note::
410+
411+
uniques will *not* contain an entry for the NA value of
412+
the ExtensionArray if there are any missing values present
413+
in `self`.
414+
415+
See Also
416+
--------
417+
pandas.factorize : Top-level factorize method that dispatches here.
418+
419+
Notes
420+
-----
421+
:meth:`pandas.factorize` offers a `sort` keyword as well.
422+
"""
423+
# Implementor note: There are two ways to override the behavior of
424+
# pandas.factorize
425+
# 1. _values_for_factorize and _from_factorized.
426+
# Specify the values passed to pandas' internal factorization
427+
# routines, and how to convert from those values back to the
428+
# original ExtensionArray.
429+
# 2. ExtensionArray.factorize.
430+
# Complete control over factorization.
431+
from pandas.core.algorithms import _factorize_array
432+
433+
arr, na_value = self._values_for_factorize()
434+
435+
labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel,
436+
na_value=na_value)
437+
438+
uniques = self._from_factorized(uniques, self)
439+
return labels, uniques
440+
356441
# ------------------------------------------------------------------------
357442
# Indexing methods
358443
# ------------------------------------------------------------------------

pandas/core/arrays/categorical.py

+8-51
Original file line numberDiff line numberDiff line change
@@ -2118,58 +2118,15 @@ def unique(self):
21182118
take_codes = sorted(take_codes)
21192119
return cat.set_categories(cat.categories.take(take_codes))
21202120

2121-
def factorize(self, na_sentinel=-1):
2122-
"""Encode the Categorical as an enumerated type.
2123-
2124-
Parameters
2125-
----------
2126-
sort : boolean, default False
2127-
Sort by values
2128-
na_sentinel: int, default -1
2129-
Value to mark "not found"
2130-
2131-
Returns
2132-
-------
2133-
labels : ndarray
2134-
An integer NumPy array that's an indexer into the original
2135-
Categorical
2136-
uniques : Categorical
2137-
A Categorical whose values are the unique values and
2138-
whose dtype matches the original CategoricalDtype. Note that if
2139-
there any unobserved categories in ``self`` will not be present
2140-
in ``uniques.values``. They will be present in
2141-
``uniques.categories``
2142-
2143-
Examples
2144-
--------
2145-
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
2146-
>>> labels, uniques = cat.factorize()
2147-
>>> labels
2148-
(array([0, 0, 1]),
2149-
>>> uniques
2150-
[a, c]
2151-
Categories (3, object): [a, b, c])
2152-
2153-
Missing values are handled
2154-
2155-
>>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None]))
2156-
>>> labels
2157-
array([ 0, 1, -1])
2158-
>>> uniques
2159-
[a, b]
2160-
Categories (2, object): [a, b]
2161-
"""
2162-
from pandas.core.algorithms import _factorize_array
2163-
2121+
def _values_for_factorize(self):
21642122
codes = self.codes.astype('int64')
2165-
# We set missing codes, normally -1, to iNaT so that the
2166-
# Int64HashTable treats them as missing values.
2167-
labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel,
2168-
na_value=-1)
2169-
uniques = self._constructor(self.categories.take(uniques),
2170-
categories=self.categories,
2171-
ordered=self.ordered)
2172-
return labels, uniques
2123+
return codes, -1
2124+
2125+
@classmethod
2126+
def _from_factorized(cls, uniques, original):
2127+
return original._constructor(original.categories.take(uniques),
2128+
categories=original.categories,
2129+
ordered=original.ordered)
21732130

21742131
def equals(self, other):
21752132
"""

pandas/core/base.py

+10-17
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Base and utility classes for pandas objects.
33
"""
44
import warnings
5+
import textwrap
56
from pandas import compat
67
from pandas.compat import builtins
78
import numpy as np
@@ -1151,24 +1152,16 @@ def memory_usage(self, deep=False):
11511152
v += lib.memory_usage_of_objects(self.values)
11521153
return v
11531154

1155+
@Substitution(
1156+
values='', order='', size_hint='',
1157+
sort=textwrap.dedent("""\
1158+
sort : boolean, default False
1159+
Sort `uniques` and shuffle `labels` to maintain the
1160+
relationship.
1161+
"""))
1162+
@Appender(algorithms._shared_docs['factorize'])
11541163
def factorize(self, sort=False, na_sentinel=-1):
1155-
"""
1156-
Encode the object as an enumerated type or categorical variable
1157-
1158-
Parameters
1159-
----------
1160-
sort : boolean, default False
1161-
Sort by values
1162-
na_sentinel: int, default -1
1163-
Value to mark "not found"
1164-
1165-
Returns
1166-
-------
1167-
labels : the indexer to the original array
1168-
uniques : the unique Index
1169-
"""
1170-
from pandas.core.algorithms import factorize
1171-
return factorize(self, sort=sort, na_sentinel=na_sentinel)
1164+
return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
11721165

11731166
_shared_docs['searchsorted'] = (
11741167
"""Find indices where elements should be inserted to maintain order.

0 commit comments

Comments
 (0)