4
4
"""
5
5
from __future__ import division
6
6
from warnings import warn , catch_warnings
7
+ from textwrap import dedent
8
+
7
9
import numpy as np
8
10
9
11
from pandas .core .dtypes .cast import (
34
36
from pandas .core import common as com
35
37
from pandas ._libs import algos , lib , hashtable as htable
36
38
from pandas ._libs .tslib import iNaT
37
- from pandas .util ._decorators import deprecate_kwarg
39
+ from pandas .util ._decorators import (Appender , Substitution ,
40
+ deprecate_kwarg )
41
+
42
+ _shared_docs = {}
38
43
39
44
40
45
# --------------- #
@@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original):
146
151
Returns
147
152
-------
148
153
Index for extension types, otherwise ndarray casted to dtype
149
-
150
154
"""
151
155
from pandas import Index
152
- if is_categorical_dtype (dtype ):
156
+ if is_extension_array_dtype (dtype ):
153
157
pass
154
158
elif is_datetime64tz_dtype (dtype ) or is_period_dtype (dtype ):
155
159
values = Index (original )._shallow_copy (values , name = None )
@@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
469
473
return labels , uniques
470
474
471
475
472
- @deprecate_kwarg (old_arg_name = 'order' , new_arg_name = None )
473
- def factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
474
- """
475
- Encode input values as an enumerated type or categorical variable
476
+ _shared_docs ['factorize' ] = """
477
+ Encode the object as an enumerated type or categorical variable.
478
+
479
+ This method is useful for obtaining a numeric representation of an
480
+ array when all that matters is identifying distinct values. `factorize`
481
+ is available as both a top-level function :func:`pandas.factorize`,
482
+ and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
476
483
477
484
Parameters
478
485
----------
479
- values : Sequence
480
- ndarrays must be 1-D. Sequences that aren't pandas objects are
481
- coereced to ndarrays before factorization.
482
- sort : boolean, default False
483
- Sort by values
486
+ %(values)s%(sort)s%(order)s
484
487
na_sentinel : int, default -1
485
- Value to mark "not found"
486
- size_hint : hint to the hashtable sizer
488
+ Value to mark "not found".
489
+ %(size_hint)s\
487
490
488
491
Returns
489
492
-------
490
- labels : the indexer to the original array
491
- uniques : ndarray (1-d) or Index
492
- the unique values. Index is returned when passed values is Index or
493
- Series
493
+ labels : ndarray
494
+ An integer ndarray that's an indexer into `uniques`.
495
+ ``uniques.take(labels)`` will have the same values as `values`.
496
+ uniques : ndarray, Index, or Categorical
497
+ The unique valid values. When `values` is Categorical, `uniques`
498
+ is a Categorical. When `values` is some other pandas object, an
499
+ `Index` is returned. Otherwise, a 1-D ndarray is returned.
500
+
501
+ .. note ::
502
+
503
+ Even if there's a missing value in `values`, `uniques` will
504
+ *not* contain an entry for it.
505
+
506
+ See Also
507
+ --------
508
+ pandas.cut : Discretize continuous-valued array.
509
+ pandas.unique : Find the unique values in an array.
510
+
511
+ Examples
512
+ --------
513
+ These examples all show factorize as a top-level method like
514
+ ``pd.factorize(values)``. The results are identical for methods like
515
+ :meth:`Series.factorize`.
516
+
517
+ >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
518
+ >>> labels
519
+ array([0, 0, 1, 2, 0])
520
+ >>> uniques
521
+ array(['b', 'a', 'c'], dtype=object)
522
+
523
+ With ``sort=True``, the `uniques` will be sorted, and `labels` will be
524
+ shuffled so that the relationship is maintained.
525
+
526
+ >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
527
+ >>> labels
528
+ array([1, 1, 0, 2, 1])
529
+ >>> uniques
530
+ array(['a', 'b', 'c'], dtype=object)
531
+
532
+ Missing values are indicated in `labels` with `na_sentinel`
533
+ (``-1`` by default). Note that missing values are never
534
+ included in `uniques`.
535
+
536
+ >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
537
+ >>> labels
538
+ array([ 0, -1, 1, 2, 0])
539
+ >>> uniques
540
+ array(['b', 'a', 'c'], dtype=object)
494
541
495
- note: an array of Periods will ignore sort as it returns an always sorted
496
- PeriodIndex.
542
+ Thus far, we've only factorized lists (which are internally coerced to
543
+ NumPy arrays). When factorizing pandas objects, the type of `uniques`
544
+ will differ. For Categoricals, a `Categorical` is returned.
545
+
546
+ >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
547
+ >>> labels, uniques = pd.factorize(cat)
548
+ >>> labels
549
+ array([0, 0, 1])
550
+ >>> uniques
551
+ [a, c]
552
+ Categories (3, object): [a, b, c]
553
+
554
+ Notice that ``'b'`` is in ``uniques.categories``, despite not being
555
+ present in ``cat.values``.
556
+
557
+ For all other pandas objects, an Index of the appropriate type is
558
+ returned.
559
+
560
+ >>> cat = pd.Series(['a', 'a', 'c'])
561
+ >>> labels, uniques = pd.factorize(cat)
562
+ >>> labels
563
+ array([0, 0, 1])
564
+ >>> uniques
565
+ Index(['a', 'c'], dtype='object')
497
566
"""
567
+
568
+
569
+ @Substitution (
570
+ values = dedent ("""\
571
+ values : sequence
572
+ A 1-D sequence. Sequences that aren't pandas objects are
573
+ coerced to ndarrays before factorization.
574
+ """ ),
575
+ order = dedent ("""\
576
+ order
577
+ .. deprecated:: 0.23.0
578
+
579
+ This parameter has no effect and is deprecated.
580
+ """ ),
581
+ sort = dedent ("""\
582
+ sort : bool, default False
583
+ Sort `uniques` and shuffle `labels` to maintain the
584
+ relationship.
585
+ """ ),
586
+ size_hint = dedent ("""\
587
+ size_hint : int, optional
588
+ Hint to the hashtable sizer.
589
+ """ ),
590
+ )
591
+ @Appender (_shared_docs ['factorize' ])
592
+ @deprecate_kwarg (old_arg_name = 'order' , new_arg_name = None )
593
+ def factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
498
594
# Implementation notes: This method is responsible for 3 things
499
595
# 1.) coercing data to array-like (ndarray, Index, extension array)
500
596
# 2.) factorizing labels and uniques
@@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
507
603
values = _ensure_arraylike (values )
508
604
original = values
509
605
510
- if is_categorical_dtype (values ):
606
+ if is_extension_array_dtype (values ):
511
607
values = getattr (values , '_values' , values )
512
- labels , uniques = values .factorize ()
608
+ labels , uniques = values .factorize (na_sentinel = na_sentinel )
513
609
dtype = original .dtype
514
610
else :
515
611
values , dtype , _ = _ensure_data (values )
0 commit comments