Skip to content

Commit 38afa93

Browse files
TomAugspurgerjreback
authored andcommitted
REF/BUG/API: factorizing categorical data (#19938)
1 parent 0f2dfbe commit 38afa93

File tree

4 files changed

+171
-18
lines changed

4 files changed

+171
-18
lines changed

doc/source/whatsnew/v0.23.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,8 @@ Categorical
843843
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
844844
- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`)
845845
- Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19565`)
846+
- Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
847+
- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
846848

847849
Datetimelike
848850
^^^^^^^^^^^^

pandas/core/algorithms.py

+61-14
Original file line numberDiff line numberDiff line change
@@ -435,15 +435,45 @@ def isin(comps, values):
435435
return f(comps, values)
436436

437437

438+
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
439+
"""Factorize an array-like to labels and uniques.
440+
441+
This doesn't do any coercion of types or unboxing before factorization.
442+
443+
Parameters
444+
----------
445+
values : ndarray
446+
check_nulls : bool
447+
Whether to check for nulls in the hashtable's 'get_labels' method.
448+
na_sentinel : int, default -1
449+
size_hint : int, optional
450+
Passed through to the hashtable's 'get_labels' method
451+
452+
Returns
453+
-------
454+
labels, uniques : ndarray
455+
"""
456+
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
457+
458+
table = hash_klass(size_hint or len(values))
459+
uniques = vec_klass()
460+
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
461+
462+
labels = _ensure_platform_int(labels)
463+
uniques = uniques.to_array()
464+
return labels, uniques
465+
466+
438467
@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
439468
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
440469
"""
441470
Encode input values as an enumerated type or categorical variable
442471
443472
Parameters
444473
----------
445-
values : ndarray (1-d)
446-
Sequence
474+
values : Sequence
475+
ndarrays must be 1-D. Sequences that aren't pandas objects are
476+
coerced to ndarrays before factorization.
447477
sort : boolean, default False
448478
Sort by values
449479
na_sentinel : int, default -1
@@ -458,26 +488,43 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
458488
Series
459489
460490
note: an array of Periods will ignore sort as it returns an always sorted
461-
PeriodIndex
491+
PeriodIndex.
462492
"""
493+
# Implementation notes: This method is responsible for 3 things
494+
# 1.) coercing data to array-like (ndarray, Index, extension array)
495+
# 2.) factorizing labels and uniques
496+
# 3.) Maybe boxing the output in an Index
497+
#
498+
# Step 2 is dispatched to extension types (like Categorical). They are
499+
# responsible only for factorization. All data coercion, sorting and boxing
500+
# should happen here.
463501

464502
values = _ensure_arraylike(values)
465503
original = values
466-
values, dtype, _ = _ensure_data(values)
467-
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
468-
469-
table = hash_klass(size_hint or len(values))
470-
uniques = vec_klass()
471-
check_nulls = not is_integer_dtype(original)
472-
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
473504

474-
labels = _ensure_platform_int(labels)
475-
uniques = uniques.to_array()
505+
if is_categorical_dtype(values):
506+
values = getattr(values, '_values', values)
507+
labels, uniques = values.factorize()
508+
dtype = original.dtype
509+
else:
510+
values, dtype, _ = _ensure_data(values)
511+
check_nulls = not is_integer_dtype(original)
512+
labels, uniques = _factorize_array(values, check_nulls,
513+
na_sentinel=na_sentinel,
514+
size_hint=size_hint)
476515

477516
if sort and len(uniques) > 0:
478517
from pandas.core.sorting import safe_sort
479-
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
480-
assume_unique=True)
518+
try:
519+
order = uniques.argsort()
520+
order2 = order.argsort()
521+
labels = take_1d(order2, labels, fill_value=na_sentinel)
522+
uniques = uniques.take(order)
523+
except TypeError:
524+
# Mixed types, where uniques.argsort fails.
525+
uniques, labels = safe_sort(uniques, labels,
526+
na_sentinel=na_sentinel,
527+
assume_unique=True)
481528

482529
uniques = _reconstruct_data(uniques, dtype, original)
483530

pandas/core/arrays/categorical.py

+59-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas import compat
88
from pandas.compat import u, lzip
99
from pandas._libs import lib, algos as libalgos
10+
from pandas._libs.tslib import iNaT
1011

1112
from pandas.core.dtypes.generic import (
1213
ABCSeries, ABCIndexClass, ABCCategoricalIndex)
@@ -364,10 +365,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
364365
self._dtype = self._dtype.update_dtype(dtype)
365366
self._codes = coerce_indexer_dtype(codes, dtype.categories)
366367

367-
@classmethod
368-
def _constructor_from_sequence(cls, scalars):
369-
return cls(scalars)
370-
371368
@property
372369
def categories(self):
373370
"""The categories of this categorical.
@@ -425,6 +422,10 @@ def _ndarray_values(self):
425422
def _constructor(self):
426423
return Categorical
427424

425+
@classmethod
426+
def _constructor_from_sequence(cls, scalars):
427+
return Categorical(scalars)
428+
428429
def copy(self):
429430
""" Copy constructor. """
430431
return self._constructor(values=self._codes.copy(),
@@ -2072,6 +2073,60 @@ def unique(self):
20722073
take_codes = sorted(take_codes)
20732074
return cat.set_categories(cat.categories.take(take_codes))
20742075

2076+
def factorize(self, na_sentinel=-1):
2077+
"""Encode the Categorical as an enumerated type.
2078+
2079+
Parameters
2080+
----------
2081+
sort : boolean, default False
2082+
Sort by values
2083+
na_sentinel : int, default -1
2084+
Value to mark "not found"
2085+
2086+
Returns
2087+
-------
2088+
labels : ndarray
2089+
An integer NumPy array that's an indexer into the original
2090+
Categorical
2091+
uniques : Categorical
2092+
A Categorical whose values are the unique values and
2093+
whose dtype matches the original CategoricalDtype. Note that any
2094+
unobserved categories in ``self`` will not be present
2095+
in ``uniques.values``. They will be present in
2096+
``uniques.categories``
2097+
2098+
Examples
2099+
--------
2100+
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
2101+
>>> labels, uniques = cat.factorize()
2102+
>>> labels
2103+
array([0, 0, 1])
2104+
>>> uniques
2105+
[a, c]
2106+
Categories (3, object): [a, b, c]
2107+
2108+
Missing values are handled
2109+
2110+
>>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None]))
2111+
>>> labels
2112+
array([ 0, 1, -1])
2113+
>>> uniques
2114+
[a, b]
2115+
Categories (2, object): [a, b]
2116+
"""
2117+
from pandas.core.algorithms import _factorize_array
2118+
2119+
codes = self.codes.astype('int64')
2120+
codes[codes == -1] = iNaT
2121+
# We set missing codes, normally -1, to iNaT so that the
2122+
# Int64HashTable treats them as missing values.
2123+
labels, uniques = _factorize_array(codes, check_nulls=True,
2124+
na_sentinel=na_sentinel)
2125+
uniques = self._constructor(self.categories.take(uniques),
2126+
categories=self.categories,
2127+
ordered=self.ordered)
2128+
return labels, uniques
2129+
20752130
def equals(self, other):
20762131
"""
20772132
Returns True if categorical arrays are equal.
+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pytest
2+
import numpy as np
3+
4+
import pandas as pd
5+
import pandas.util.testing as tm
6+
7+
8+
@pytest.mark.parametrize('ordered', [True, False])
9+
@pytest.mark.parametrize('categories', [
10+
['b', 'a', 'c'],
11+
['a', 'b', 'c', 'd'],
12+
])
13+
def test_factorize(categories, ordered):
14+
cat = pd.Categorical(['b', 'b', 'a', 'c', None],
15+
categories=categories,
16+
ordered=ordered)
17+
labels, uniques = pd.factorize(cat)
18+
expected_labels = np.array([0, 0, 1, 2, -1], dtype='int64')
19+
expected_uniques = pd.Categorical(['b', 'a', 'c'],
20+
categories=categories,
21+
ordered=ordered)
22+
23+
tm.assert_numpy_array_equal(labels, expected_labels)
24+
tm.assert_categorical_equal(uniques, expected_uniques)
25+
26+
27+
def test_factorized_sort():
28+
cat = pd.Categorical(['b', 'b', None, 'a'])
29+
labels, uniques = pd.factorize(cat, sort=True)
30+
expected_labels = np.array([1, 1, -1, 0], dtype='int64')
31+
expected_uniques = pd.Categorical(['a', 'b'])
32+
33+
tm.assert_numpy_array_equal(labels, expected_labels)
34+
tm.assert_categorical_equal(uniques, expected_uniques)
35+
36+
37+
def test_factorized_sort_ordered():
38+
cat = pd.Categorical(['b', 'b', None, 'a'],
39+
categories=['c', 'b', 'a'],
40+
ordered=True)
41+
42+
labels, uniques = pd.factorize(cat, sort=True)
43+
expected_labels = np.array([0, 0, -1, 1], dtype='int64')
44+
expected_uniques = pd.Categorical(['b', 'a'],
45+
categories=['c', 'b', 'a'],
46+
ordered=True)
47+
48+
tm.assert_numpy_array_equal(labels, expected_labels)
49+
tm.assert_categorical_equal(uniques, expected_uniques)

0 commit comments

Comments
 (0)