diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 45f86f044a4b2..065a5782aced1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,6 +4,8 @@ """ from __future__ import division from warnings import warn, catch_warnings +from textwrap import dedent + import numpy as np from pandas.core.dtypes.cast import ( @@ -34,7 +36,10 @@ from pandas.core import common as com from pandas._libs import algos, lib, hashtable as htable from pandas._libs.tslib import iNaT -from pandas.util._decorators import deprecate_kwarg +from pandas.util._decorators import (Appender, Substitution, + deprecate_kwarg) + +_shared_docs = {} # --------------- # @@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original): Returns ------- Index for extension types, otherwise ndarray casted to dtype - """ from pandas import Index - if is_categorical_dtype(dtype): + if is_extension_array_dtype(dtype): pass elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): values = Index(original)._shallow_copy(values, name=None) @@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, return labels, uniques -@deprecate_kwarg(old_arg_name='order', new_arg_name=None) -def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): - """ - Encode input values as an enumerated type or categorical variable +_shared_docs['factorize'] = """ + Encode the object as an enumerated type or categorical variable. + + This method is useful for obtaining a numeric representation of an + array when all that matters is identifying distinct values. `factorize` + is available as both a top-level function :func:`pandas.factorize`, + and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. Parameters ---------- - values : Sequence - ndarrays must be 1-D. Sequences that aren't pandas objects are - coereced to ndarrays before factorization. 
- sort : boolean, default False - Sort by values + %(values)s%(sort)s%(order)s na_sentinel : int, default -1 - Value to mark "not found" - size_hint : hint to the hashtable sizer + Value to mark "not found". + %(size_hint)s\ Returns ------- - labels : the indexer to the original array - uniques : ndarray (1-d) or Index - the unique values. Index is returned when passed values is Index or - Series + labels : ndarray + An integer ndarray that's an indexer into `uniques`. + ``uniques.take(labels)`` will have the same values as `values`. + uniques : ndarray, Index, or Categorical + The unique valid values. When `values` is Categorical, `uniques` + is a Categorical. When `values` is some other pandas object, an + `Index` is returned. Otherwise, a 1-D ndarray is returned. + + .. note :: + + Even if there's a missing value in `values`, `uniques` will + *not* contain an entry for it. + + See Also + -------- + pandas.cut : Discretize continuous-valued array. + pandas.unique : Find the unique values in an array. + + Examples + -------- + These examples all show factorize as a top-level method like + ``pd.factorize(values)``. The results are identical for methods like + :meth:`Series.factorize`. + + >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> labels + array([0, 0, 1, 2, 0]) + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + With ``sort=True``, the `uniques` will be sorted, and `labels` will be + shuffled so that the relationship is maintained. + + >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> labels + array([1, 1, 0, 2, 1]) + >>> uniques + array(['a', 'b', 'c'], dtype=object) + + Missing values are indicated in `labels` with `na_sentinel` + (``-1`` by default). Note that missing values are never + included in `uniques`. 
+ + >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> labels + array([ 0, -1, 1, 2, 0]) + >>> uniques + array(['b', 'a', 'c'], dtype=object) - note: an array of Periods will ignore sort as it returns an always sorted - PeriodIndex. + Thus far, we've only factorized lists (which are internally coerced to + NumPy arrays). When factorizing pandas objects, the type of `uniques` + will differ. For Categoricals, a `Categorical` is returned. + + >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) + >>> labels, uniques = pd.factorize(cat) + >>> labels + array([0, 0, 1]) + >>> uniques + [a, c] + Categories (3, object): [a, b, c] + + Notice that ``'b'`` is in ``uniques.categories``, despite not being + present in ``cat.values``. + + For all other pandas objects, an Index of the appropriate type is + returned. + + >>> cat = pd.Series(['a', 'a', 'c']) + >>> labels, uniques = pd.factorize(cat) + >>> labels + array([0, 0, 1]) + >>> uniques + Index(['a', 'c'], dtype='object') """ + + +@Substitution( + values=dedent("""\ + values : sequence + A 1-D sequence. Sequences that aren't pandas objects are + coerced to ndarrays before factorization. + """), + order=dedent("""\ + order + .. deprecated:: 0.23.0 + + This parameter has no effect and is deprecated. + """), + sort=dedent("""\ + sort : bool, default False + Sort `uniques` and shuffle `labels` to maintain the + relationship. + """), + size_hint=dedent("""\ + size_hint : int, optional + Hint to the hashtable sizer. + """), +) +@Appender(_shared_docs['factorize']) +@deprecate_kwarg(old_arg_name='order', new_arg_name=None) +def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) # 2.) 
factorizing labels and uniques @@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): values = _ensure_arraylike(values) original = values - if is_categorical_dtype(values): + if is_extension_array_dtype(values): values = getattr(values, '_values', values) - labels, uniques = values.factorize() + labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d53caa265b9b3..c281bd80cb274 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -77,6 +77,24 @@ def _constructor_from_sequence(cls, scalars): """ raise AbstractMethodError(cls) + @classmethod + def _from_factorized(cls, values, original): + """Reconstruct an ExtensionArray after factorization. + + Parameters + ---------- + values : ndarray + An integer ndarray with the factorized values. + original : ExtensionArray + The original ExtensionArray that factorize was called on. + + See Also + -------- + pandas.factorize + ExtensionArray.factorize + """ + raise AbstractMethodError(cls) + # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ @@ -353,6 +371,73 @@ def unique(self): uniques = unique(self.astype(object)) return self._constructor_from_sequence(uniques) + def _values_for_factorize(self): + # type: () -> Tuple[ndarray, Any] + """Return an array and missing value suitable for factorization. + + Returns + ------- + values : ndarray + An array suitable for factorization. This should maintain order + and be a supported dtype (Float64, Int64, UInt64, String, Object). + By default, the extension array is cast to object dtype. + na_value : object + The value in `values` to consider missing. 
This will be treated + as NA in the factorization routines, so it will be coded as + `na_sentinel` and not included in `uniques`. By default, + ``np.nan`` is used. + """ + return self.astype(object), np.nan + + def factorize(self, na_sentinel=-1): + # type: (int) -> Tuple[ndarray, ExtensionArray] + """Encode the extension array as an enumerated type. + + Parameters + ---------- + na_sentinel : int, default -1 + Value to use in the `labels` array to indicate missing values. + + Returns + ------- + labels : ndarray + An integer NumPy array that's an indexer into the original + ExtensionArray. + uniques : ExtensionArray + An ExtensionArray containing the unique values of `self`. + + .. note:: + + uniques will *not* contain an entry for the NA value of + the ExtensionArray if there are any missing values present + in `self`. + + See Also + -------- + pandas.factorize : Top-level factorize method that dispatches here. + + Notes + ----- + :meth:`pandas.factorize` offers a `sort` keyword as well. + """ + # Implementer note: There are two ways to override the behavior of + # pandas.factorize + # 1. _values_for_factorize and _from_factorized. + # Specify the values passed to pandas' internal factorization + # routines, and how to convert from those values back to the + # original ExtensionArray. + # 2. ExtensionArray.factorize. + # Complete control over factorization. 
+ from pandas.core.algorithms import _factorize_array + + arr, na_value = self._values_for_factorize() + + labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel, + na_value=na_value) + + uniques = self._from_factorized(uniques, self) + return labels, uniques + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ac57660300be4..b5a4785fd98a6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2118,58 +2118,15 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) - def factorize(self, na_sentinel=-1): - """Encode the Categorical as an enumerated type. - - Parameters - ---------- - sort : boolean, default False - Sort by values - na_sentinel: int, default -1 - Value to mark "not found" - - Returns - ------- - labels : ndarray - An integer NumPy array that's an indexer into the original - Categorical - uniques : Categorical - A Categorical whose values are the unique values and - whose dtype matches the original CategoricalDtype. Note that if - there any unobserved categories in ``self`` will not be present - in ``uniques.values``. 
They will be present in - ``uniques.categories`` - - Examples - -------- - >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) - >>> labels, uniques = cat.factorize() - >>> labels - (array([0, 0, 1]), - >>> uniques - [a, c] - Categories (3, object): [a, b, c]) - - Missing values are handled - - >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None])) - >>> labels - array([ 0, 1, -1]) - >>> uniques - [a, b] - Categories (2, object): [a, b] - """ - from pandas.core.algorithms import _factorize_array - + def _values_for_factorize(self): codes = self.codes.astype('int64') - # We set missing codes, normally -1, to iNaT so that the - # Int64HashTable treats them as missing values. - labels, uniques = _factorize_array(codes, na_sentinel=na_sentinel, - na_value=-1) - uniques = self._constructor(self.categories.take(uniques), - categories=self.categories, - ordered=self.ordered) - return labels, uniques + return codes, -1 + + @classmethod + def _from_factorized(cls, uniques, original): + return original._constructor(original.categories.take(uniques), + categories=original.categories, + ordered=original.ordered) def equals(self, other): """ diff --git a/pandas/core/base.py b/pandas/core/base.py index b3eb9a0ae7530..99e2af9fb3aeb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -2,6 +2,7 @@ Base and utility classes for pandas objects. """ import warnings +import textwrap from pandas import compat from pandas.compat import builtins import numpy as np @@ -1151,24 +1152,16 @@ def memory_usage(self, deep=False): v += lib.memory_usage_of_objects(self.values) return v + @Substitution( + values='', order='', size_hint='', + sort=textwrap.dedent("""\ + sort : boolean, default False + Sort `uniques` and shuffle `labels` to maintain the + relationship. 
+ """)) + @Appender(algorithms._shared_docs['factorize']) def factorize(self, sort=False, na_sentinel=-1): - """ - Encode the object as an enumerated type or categorical variable - - Parameters - ---------- - sort : boolean, default False - Sort by values - na_sentinel: int, default -1 - Value to mark "not found" - - Returns - ------- - labels : the indexer to the original array - uniques : the unique Index - """ - from pandas.core.algorithms import factorize - return factorize(self, sort=sort, na_sentinel=na_sentinel) + return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs['searchsorted'] = ( """Find indices where elements should be inserted to maintain order. diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4d467d62d0a56..f9f079cb21858 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pandas.util.testing as tm from .base import BaseExtensionTests @@ -82,3 +83,23 @@ def test_unique(self, data, box, method): assert len(result) == 1 assert isinstance(result, type(data)) assert result[0] == duplicated[0] + + @pytest.mark.parametrize('na_sentinel', [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + labels, uniques = pd.factorize(data_for_grouping, + na_sentinel=na_sentinel) + expected_labels = np.array([0, 0, na_sentinel, + na_sentinel, 1, 1, 0, 2], + dtype='int64') + expected_uniques = data_for_grouping.take([0, 4, 7]) + + tm.assert_numpy_array_equal(labels, expected_labels) + self.assert_extension_array_equal(uniques, expected_uniques) + + @pytest.mark.parametrize('na_sentinel', [-1, -2]) + def test_factorize_equivalence(self, data_for_grouping, na_sentinel): + l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + + tm.assert_numpy_array_equal(l1, l2) + 
self.assert_extension_array_equal(u1, u2) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index b602d9ee78e2a..7528299578326 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -46,6 +46,11 @@ def na_value(): return np.nan +@pytest.fixture +def data_for_grouping(): + return Categorical(['a', 'a', None, None, 'b', 'b', 'a', 'c']) + + class TestDtype(base.BaseDtypeTests): pass diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 04dfb408fc378..4cb4ea21d9be3 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -66,3 +66,14 @@ def na_cmp(): def na_value(): """The scalar missing value for this type. Default 'None'""" return None + + +@pytest.fixture +def data_for_grouping(): + """Data for factorization, grouping, and unique tests. + + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + raise NotImplementedError diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index f1852542088ff..b66a14c77a059 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -36,6 +36,10 @@ def __init__(self, values): def _constructor_from_sequence(cls, scalars): return cls(scalars) + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.values[item] diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b6303ededd0dc..22c1a67a0d60d 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -49,6 +49,15 @@ def na_value(): return decimal.Decimal("NaN") +@pytest.fixture +def data_for_grouping(): + b = decimal.Decimal('1.0') + a 
= decimal.Decimal('0.0') + c = decimal.Decimal('2.0') + na = decimal.Decimal('NaN') + return DecimalArray([b, b, na, na, a, a, b, c]) + + class BaseDecimal(object): def assert_series_equal(self, left, right, *args, **kwargs): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ee0951812b8f0..51a68a3701046 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -37,6 +37,10 @@ def __init__(self, values): def _constructor_from_sequence(cls, scalars): return cls(scalars) + @classmethod + def _from_factorized(cls, values, original): + return cls([collections.UserDict(x) for x in values if x != ()]) + def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] @@ -108,6 +112,10 @@ def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) return cls(data) + def _values_for_factorize(self): + frozen = tuple(tuple(x.items()) for x in self) + return np.array(frozen, dtype=object), () + def _values_for_argsort(self): # Disable NumPy's shape inference by including an empty tuple... # If all the elemnts of self are the same size P, NumPy will diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 8083a1ce69092..63d97d5e7a2c5 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -48,6 +48,17 @@ def na_cmp(): return operator.eq +@pytest.fixture +def data_for_grouping(): + return JSONArray([ + {'b': 1}, {'b': 1}, + {}, {}, + {'a': 0, 'c': 2}, {'a': 0, 'c': 2}, + {'b': 1}, + {'c': 2}, + ]) + + class TestDtype(base.BaseDtypeTests): pass