From c2f490e283c1d6d1b4ebe5cfd829edaa5a572c45 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Wed, 16 Jul 2014 20:32:11 +0200 Subject: [PATCH 01/10] Categorical: preserve ints when NaN are present `Categorical([1, np.nan])` would end up with a single `1.` float level. This commit ensures that if `values` is a list of ints and contains np.nan, the float conversion does not take place. --- pandas/core/categorical.py | 20 ++++++++++++++- pandas/tests/test_categorical.py | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c9674aea4a715..1f367d7a88d5d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2,6 +2,7 @@ import numpy as np from warnings import warn +import types from pandas import compat from pandas.compat import u @@ -208,8 +209,25 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, # under certain versions of numpy as well inferred = com._possibly_infer_to_datetimelike(values) if not isinstance(inferred, np.ndarray): + + # Input sanitation... + if com._is_sequence(values) or isinstance(values, types.GeneratorType): + # isnull doesn't work with generators/xrange, so convert all to lists + # TODO: prevent allocation of two times the array/list be converting directly + values = list(values) + elif np.isscalar(values): + values = [values] + from pandas.core.series import _sanitize_array - values = _sanitize_array(values, None) + # On list with NaNs, int values will be converted to float. Use "object" dtype + # to prevent this. In the end objects will be casted to int/... in the level + # assignment step.
+ # tuple are list_like but com.isnull() will return a single bool, + # which then raises an AttributeError: 'bool' object has no attribute 'any' + has_null = (com.is_list_like(values) and not isinstance(values, tuple) + and com.isnull(values).any()) + dtype = 'object' if has_null else None + values = _sanitize_array(values, None, dtype=dtype) if levels is None: try: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 421e05f5a3bc7..cf6f0bd38a02b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -111,6 +111,50 @@ def test_constructor(self): cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) self.assertTrue(com.is_integer_dtype(cat.levels)) + # https://github.com/pydata/pandas/issues/3678 + cat = pd.Categorical([np.nan,1, 2, 3]) + self.assertTrue(com.is_integer_dtype(cat.levels)) + + # this should result in floats + cat = pd.Categorical([np.nan, 1, 2., 3 ]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + cat = pd.Categorical([np.nan, 1., 2., 3. 
]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + # corner cases + cat = pd.Categorical([1]) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + cat = pd.Categorical(["a"]) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == "a") + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + # Scalars should be converted to lists + cat = pd.Categorical(1) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + + def test_constructor_with_generator(self): + # This was raising an Error in isnull(single_val).any() because isnull returned a scalar + # for a generator + + a = (a for x in [1,2]) + cat = Categorical(a) + + # This does actually a xrange, which is a sequence instead of a generator + from pandas.core.index import MultiIndex + MultiIndex.from_product([range(5), ['a', 'b', 'c']]) + + def test_from_codes(self): # too few levels From 90a81dfda1f6616655d9b4aba0e8e2aea1ab317f Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Wed, 16 Jul 2014 22:05:38 +0200 Subject: [PATCH 02/10] Categorical: fix describe with np.nan --- pandas/core/categorical.py | 21 +++++++++++++++++--- pandas/tests/test_categorical.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1f367d7a88d5d..bfa634fd794d6 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -934,14 +934,29 @@ def describe(self): 'values' : self._codes } ).groupby('codes').count() - counts.index = self.levels.take(counts.index) - counts = counts.reindex(self.levels) freqs = counts / float(counts.sum()) from pandas.tools.merge import concat result = concat([counts,freqs],axis=1) - result.index.name = 'levels' result.columns = 
['counts','freqs'] + + # fill in the real levels + check = result.index == -1 + if check.any(): + # Sort -1 (=NaN) to the last position + index = np.arange(0, len(self.levels)+1) + index[-1] = -1 + result = result.reindex(index) + # build new index + levels = np.arange(0,len(self.levels)+1 ,dtype=object) + levels[:-1] = self.levels + levels[-1] = np.nan + result.index = levels.take(result.index) + else: + result.index = self.levels.take(result.index) + result = result.reindex(self.levels) + result.index.name = 'levels' + return result ##### utility routines ##### diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cf6f0bd38a02b..faeb842e5f352 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -249,6 +249,16 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # check unused levels + cat = self.factor.copy() + cat.levels = ["a","b","c","d"] + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan], + freqs=[3/8., 2/8., 3/8., np.nan], + levels=['a', 'b', 'c', 'd']) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + # check an integer one desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe() expected = DataFrame.from_dict(dict(counts=[5, 3, 3], @@ -258,6 +268,29 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # https://github.com/pydata/pandas/issues/3678 + # describe should work with NaN + cat = pd.Categorical([np.nan,1, 2, 2]) + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, 1], + freqs=[1/4., 2/4., 1/4.], + levels=[1,2,np.nan] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + # having NaN as level and as "not available" should also print two NaNs in describe! 
+ cat = pd.Categorical([np.nan,1, 2, 2]) + cat.levels = [1,2,np.nan] + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1], + freqs=[1/4., 2/4., np.nan, 1/4.], + levels=[1,2,np.nan,np.nan] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + def test_print(self): expected = [" a", " b", " b", " a", " a", " c", " c", " c", "Levels (3, object): [a < b < c]"] From f4bf9ee5399661ff2aedd8b2531dd16ba92141a9 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Wed, 16 Jul 2014 22:28:55 +0200 Subject: [PATCH 03/10] Categorical: ensure that one can assign np.nan --- pandas/core/categorical.py | 5 +++-- pandas/tests/test_categorical.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index bfa634fd794d6..2bb8ac23c0100 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -7,7 +7,7 @@ from pandas import compat from pandas.compat import u -from pandas.core.algorithms import factorize, unique +from pandas.core.algorithms import factorize from pandas.core.base import PandasObject from pandas.core.index import Index, _ensure_index from pandas.core.indexing import _is_null_slice @@ -778,7 +778,8 @@ def __setitem__(self, key, value): rvalue = value if com.is_list_like(value) else [value] to_add = Index(rvalue)-self.levels - if len(to_add): + # no assignments of values not in levels, but it's always ok to set something to np.nan + if len(to_add) and not com.isnull(to_add).all(): raise ValueError("cannot setitem on a Categorical with a new level," " set the levels first") diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index faeb842e5f352..317fa158977eb 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -573,6 +573,13 @@ def test_slicing_directly(self): self.assert_numpy_array_equal(sliced._codes, expected._codes) tm.assert_index_equal(sliced.levels, 
expected.levels) + def test_set_item_nan(self): + cat = pd.Categorical([1,2,3]) + exp = pd.Categorical([1,np.nan,3], levels=[1,2,3]) + cat[1] = np.nan + self.assertTrue(cat.equals(exp)) + + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True @@ -1550,6 +1557,12 @@ def f(): df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) tm.assert_frame_equal(df, exp) + # ensure that one can set something to np.nan + s = Series(Categorical([1,2,3])) + exp = Series(Categorical([1,np.nan,3])) + s[1] = np.nan + tm.assert_series_equal(s, exp) + def test_concat(self): cat = pd.Categorical(["a","b"], levels=["a","b"]) From 130b61e4f5ed0342562b32fa089f1857c48d5af8 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Wed, 23 Jul 2014 21:55:16 +0200 Subject: [PATCH 04/10] Categorical: fix assigning NaN if NaN in levels --- pandas/core/categorical.py | 7 +++++++ pandas/tests/test_categorical.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2bb8ac23c0100..43a6a80b75e47 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -805,6 +805,13 @@ def __setitem__(self, key, value): key = self._codes[key] lindexer = self.levels.get_indexer(rvalue) + + # float levels do currently return -1 for np.nan, even if np.nan is included in the index + # "repair" this here + if com.isnull(rvalue).any() and com.isnull(self.levels).any(): + nan_pos = np.where(com.isnull(self.levels)) + lindexer[lindexer == -1] = nan_pos + self._codes[key] = lindexer #### reduction ops #### diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 317fa158977eb..29c722bfe8660 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -579,6 +579,37 @@ def test_set_item_nan(self): cat[1] = np.nan self.assertTrue(cat.equals(exp)) + # if nan in levels, the proper code should be set! 
+ cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1] = np.nan + exp = np.array([0,3,2,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = np.nan + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = [np.nan, 1] + exp = np.array([0,3,0,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = [np.nan, np.nan] + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[pd.isnull(cat)] = np.nan + exp = np.array([0,1,2,3]) + self.assert_numpy_array_equal(cat.codes, exp) + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True From 65d9d6ebcb94491cd5eed907a1aeb2dc05f65ce5 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Wed, 23 Jul 2014 21:56:53 +0200 Subject: [PATCH 05/10] API: change default Categorical.from_codes() to ordered=False In the normal constructor `ordered=True` is only assumed if the levels are given or the values are sortable (which is most of the cases), but in `from_codes(...)` we can't assume this so the default should be `False`.
--- pandas/core/categorical.py | 8 ++++---- pandas/tests/test_categorical.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 43a6a80b75e47..bfa44d565a869 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -110,9 +110,9 @@ class Categorical(PandasObject): Attributes ---------- - levels : ndarray + levels : Index The levels of this categorical - codes : Index + codes : ndarray The codes (integer positions, which point to the levels) of this categorical, read only ordered : boolean Whether or not this Categorical is ordered @@ -295,7 +295,7 @@ def from_array(cls, data): return Categorical(data) @classmethod - def from_codes(cls, codes, levels, ordered=True, name=None): + def from_codes(cls, codes, levels, ordered=False, name=None): """ Make a Categorical type from codes and levels arrays. @@ -312,7 +312,7 @@ def from_codes(cls, codes, levels, ordered=True, name=None): The levels for the categorical. Items need to be unique. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, - the resulting categorical will be ordered. + the resulting categorical will be unordered. name : str, optional Name for the Categorical variable. 
""" diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 29c722bfe8660..815ac3d7b29da 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -178,7 +178,7 @@ def f(): self.assertRaises(ValueError, f) - exp = Categorical(["a","b","c"]) + exp = Categorical(["a","b","c"], ordered=False) res = Categorical.from_codes([0,1,2], ["a","b","c"]) self.assertTrue(exp.equals(res)) From 5c4f1bdf700d303ba27b73846602497677b0618a Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Thu, 24 Jul 2014 00:29:51 +0200 Subject: [PATCH 06/10] Categorical: add some links to Categorical in the other docs --- doc/source/10min.rst | 29 ++++++++++++++++++++++++++++- doc/source/categorical.rst | 1 + doc/source/reshaping.rst | 7 +++++++ doc/source/v0.15.0.txt | 3 ++- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 985f112979a7e..6424b82779f0f 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -66,7 +66,8 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 'D' : np.array([3] * 4,dtype='int32'), - 'E' : 'foo' }) + 'E' : pd.Categorical(["test","train","test","train"]), + 'F' : 'foo' }) df2 Having specific :ref:`dtypes ` @@ -635,6 +636,32 @@ the quarter end: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 ts.head() +Categoricals +------------ + +Since version 0.15, pandas can include categorical data in a `DataFrame`. For full docs, see the +:ref:`Categorical introduction ` and the :ref:`API documentation ` . + +.. 
ipython:: python + + df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + + # convert the raw grades to a categorical + df["grade"] = pd.Categorical(df["raw_grade"]) + + # Alternative: df["grade"] = df["raw_grade"].astype("category") + df["grade"] + + # Rename the levels + df["grade"].cat.levels = ["very good", "good", "very bad"] + + # Reorder the levels and simultaneously add the missing levels + df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] + df.sort("grade") + df.groupby("grade").size() + + Plotting -------- diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c08351eb87a79..c47653e92edb3 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -90,6 +90,7 @@ By using some special functions: df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) +See :ref:`documentation ` for :func:`~pandas.cut`. `Categoricals` have a specific ``category`` :ref:`dtype `: diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 92a35d0276e22..3d40be37dbbb3 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -503,3 +503,10 @@ handling of NaN: pd.factorize(x, sort=True) np.unique(x, return_inverse=True)[::-1] + +.. note:: + If you just want to handle one column as a categorical variable (like R's factor), + you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or + ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`, + see the :ref:`Categorical introduction ` and the + :ref:`API documentation `. This feature was introduced in version 0.15. diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index b91c306e9b193..c7a9aa5c3630b 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -225,7 +225,8 @@ Categoricals in Series/DataFrame methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. 
(:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`). -For full docs, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. +For full docs, see the :ref:`Categorical introduction ` and the +:ref:`API documentation `. .. ipython:: python From 0438a30ae8464a3a3ddf35ca6bb5a9edbdb79a82 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 8 Aug 2014 22:22:26 +0200 Subject: [PATCH 07/10] Categorical: use s.values when calling private methods s.values is the underlying Categorical object, s.cat will be changed to only expose the API methods/properties. --- pandas/core/format.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 8f749d07296a7..0539d803a42a4 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -177,7 +177,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done for Categoricals # Only added when we request a name if self.name and com.is_categorical_dtype(self.series.dtype): - level_info = self.series.cat._repr_level_info() + level_info = self.series.values._repr_level_info() if footer: footer += "\n" footer += level_info diff --git a/pandas/core/series.py b/pandas/core/series.py index 5a490992c478c..25d80c27d7b02 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -900,7 +900,7 @@ def _repr_footer(self): # Categorical if com.is_categorical_dtype(self.dtype): - level_info = self.cat._repr_level_info() + level_info = self.values._repr_level_info() return u('%sLength: %d, dtype: %s\n%s') % (namestr, len(self), str(self.dtype.name), From 19f4d46555ddd20eea554c6ca62b777ca2d3460f Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 8 Aug 2014 22:29:43 +0200 Subject: [PATCH 08/10] Categorical: Change series.cat to only expose the API --- doc/source/api.rst | 11 ++++-- pandas/core/categorical.py | 30 
+++++++++++++++- pandas/core/series.py | 5 +-- pandas/tests/test_categorical.py | 59 +++++++++++++++++--------------- 4 files changed, 71 insertions(+), 34 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index ec6e2aff870c6..a6a04af610ee0 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -528,11 +528,17 @@ and has the following usable methods and properties (all available as :toctree: generated/ Categorical - Categorical.from_codes Categorical.levels Categorical.ordered Categorical.reorder_levels Categorical.remove_unused_levels + +The following methods are considered API when using ``Categorical`` directly: + +.. autosummary:: + :toctree: generated/ + + Categorical.from_codes Categorical.min Categorical.max Categorical.mode @@ -547,7 +553,7 @@ the Categorical back to a numpy array, so levels and order information is not pr Categorical.__array__ To create compatibility with `pandas.Series` and `numpy` arrays, the following (non-API) methods -are also introduced. +are also introduced and available when ``Categorical`` is used directly. .. autosummary:: :toctree: generated/ @@ -564,7 +570,6 @@ are also introduced. Categorical.argsort Categorical.fillna - Plotting ~~~~~~~~ .. 
currentmodule:: pandas diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index bfa44d565a869..7c220afe243df 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -8,7 +8,7 @@ from pandas.compat import u from pandas.core.algorithms import factorize -from pandas.core.base import PandasObject +from pandas.core.base import PandasObject, PandasDelegate from pandas.core.index import Index, _ensure_index from pandas.core.indexing import _is_null_slice from pandas.tseries.period import PeriodIndex @@ -967,6 +967,34 @@ def describe(self): return result +##### The Series.cat accessor ##### + +class CategoricalProperties(PandasDelegate): + """ + This is a delegator class that passes thru limit property access + """ + + def __init__(self, values, index): + self.categorical = values + self.index = index + + def _delegate_property_get(self, name): + return getattr(self.categorical, name) + + def _delegate_property_set(self, name, new_values): + return setattr(self.categorical, name, new_values) + + def _delegate_method(self, name, *args, **kwargs): + method = getattr(self.categorical, name) + return method(*args, **kwargs) + +CategoricalProperties._add_delegate_accessors(delegate=Categorical, + accessors=["levels", "codes", "ordered"], + typ='property') +CategoricalProperties._add_delegate_accessors(delegate=Categorical, + accessors=["reorder_levels", "remove_unused_levels"], + typ='method') + ##### utility routines ##### def _get_codes_for_values(values, levels): diff --git a/pandas/core/series.py b/pandas/core/series.py index 25d80c27d7b02..ef6bdf99915b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2415,11 +2415,12 @@ def dt(self): #------------------------------------------------------------------------------ # Categorical methods - @property + @cache_readonly def cat(self): + from pandas.core.categorical import CategoricalProperties if not com.is_categorical_dtype(self.dtype): raise TypeError("Can only use .cat 
accessor with a 'category' dtype") - return self.values + return CategoricalProperties(self.values, self.index) Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 815ac3d7b29da..892e4177b4bb5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -290,6 +290,24 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # empty levels show up as NA + cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + result = cat.describe() + + expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], + columns=['counts','freqs'], + index=Index(['a','b','c'],name='levels')) + tm.assert_frame_equal(result,expected) + + # NA as a level + cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) + result = cat.describe() + + expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], + columns=['counts','freqs'], + index=Index(['b','a','c',np.nan],name='levels')) + tm.assert_frame_equal(result,expected) + def test_print(self): expected = [" a", " b", " b", " a", " a", " c", " c", " c", @@ -731,7 +749,7 @@ def test_sideeffects_free(self): # so this WILL change values cat = Categorical(["a","b","c","a"]) s = pd.Series(cat) - self.assertTrue(s.cat is cat) + self.assertTrue(s.values is cat) s.cat.levels = [1,2,3] exp_s = np.array([1,2,3,1]) self.assert_numpy_array_equal(s.__array__(), exp_s) @@ -747,20 +765,20 @@ def test_nan_handling(self): # Nans are represented as -1 in labels s = Series(Categorical(["a","b",np.nan,"a"])) self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) - self.assert_numpy_array_equal(s.cat._codes, np.array([0,1,-1,0])) + self.assert_numpy_array_equal(s.cat.codes, np.array([0,1,-1,0])) # If levels have nan included, the label should point to that instead s2 = Series(Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])) 
self.assert_numpy_array_equal(s2.cat.levels, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s2.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s2.cat.codes, np.array([0,1,2,0])) # Changing levels should also make the replaced level np.nan s3 = Series(Categorical(["a","b","c","a"])) s3.cat.levels = ["a","b",np.nan] self.assert_numpy_array_equal(s3.cat.levels, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s3.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s3.cat.codes, np.array([0,1,2,0])) def test_sequence_like(self): @@ -770,8 +788,8 @@ def test_sequence_like(self): df['grade'] = Categorical(df['raw_grade']) # basic sequencing testing - result = list(df.grade.cat) - expected = np.array(df.grade.cat).tolist() + result = list(df.grade.values) + expected = np.array(df.grade.values).tolist() tm.assert_almost_equal(result,expected) # iteration @@ -813,7 +831,7 @@ def test_series_delegations(self): exp_values = np.array(["a","b","c","a"]) s.cat.reorder_levels(["c","b","a"]) self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused levels @@ -822,7 +840,7 @@ def test_series_delegations(self): exp_values = np.array(["a","b","b","a"]) s.cat.remove_unused_levels() self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error on wrong inputs: @@ -881,31 +899,16 @@ def test_describe(self): result = self.cat.describe() self.assertEquals(len(result.columns),1) - # empty levels show up as NA - s = Series(Categorical(["a","b","b","b"], 
levels=['a','b','c'], ordered=True)) - result = s.cat.describe() - expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], - columns=['counts','freqs'], - index=Index(['a','b','c'],name='levels')) - tm.assert_frame_equal(result,expected) + # In a frame, describe() for the cat should be the same as for string arrays (count, unique, + # top, freq) + cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + s = Series(cat) result = s.describe() expected = Series([4,2,"b",3],index=['count','unique','top', 'freq']) tm.assert_series_equal(result,expected) - # NA as a level - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) - result = cat.describe() - - expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], - columns=['counts','freqs'], - index=Index(['b','a','c',np.nan],name='levels')) - tm.assert_frame_equal(result,expected) - - - # In a frame, describe() for the cat should be the same as for string arrays (count, unique, - # top, freq) cat = pd.Series(pd.Categorical(["a","b","c","c"])) df3 = pd.DataFrame({"cat":cat, "s":["a","b","c","c"]}) res = df3.describe() @@ -1085,7 +1088,7 @@ def test_sort(self): # Cats must be sorted in a dataframe res = df.sort(columns=["string"], ascending=False) exp = np.array(["d", "c", "b", "a"]) - self.assert_numpy_array_equal(res["sort"].cat.__array__(), exp) + self.assert_numpy_array_equal(res["sort"].values.__array__(), exp) self.assertEqual(res["sort"].dtype, "category") res = df.sort(columns=["sort"], ascending=False) From 47953a296167d5f845e681a56228744e9fd8a2ca Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 8 Aug 2014 23:00:00 +0200 Subject: [PATCH 09/10] Categorical: Fix order and na_position --- pandas/core/categorical.py | 19 ++++++++++++++++++- pandas/tests/test_categorical.py | 24 ++++++++++++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 7c220afe243df..9aa801ba8336d 100644 --- 
a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -521,10 +521,27 @@ def order(self, inplace=False, ascending=True, na_position='last', **kwargs): if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) - codes = np.sort(self._codes.copy()) + codes = np.sort(self._codes) if not ascending: codes = codes[::-1] + # NaN handling + na_mask = (codes==-1) + if na_mask.any(): + n_nans = len(codes[na_mask]) + if na_position=="first" and not ascending: + # in this case sort to the front + new_codes = codes.copy() + new_codes[0:n_nans] = -1 + new_codes[n_nans:] = codes[~na_mask] + codes = new_codes + elif na_position=="last" and not ascending: + # ... and to the end + new_codes = codes.copy() + pos = len(codes)-n_nans + new_codes[0:pos] = codes[~na_mask] + new_codes[pos:] = -1 + codes = new_codes if inplace: self._codes = codes return diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 892e4177b4bb5..f5b20e924cdf6 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1131,17 +1131,29 @@ def f(): res = cat.order(ascending=False, na_position='last') exp_val = np.array(["d","c","b","a", np.nan],dtype=object) exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) cat = Categorical(["a","c","b","d", np.nan], 
ordered=True) res = cat.order(ascending=False, na_position='first') exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='last') + exp_val = np.array(["d","c","b","a",np.nan],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) def test_slicing(self): cat = Series(Categorical([1,2,3,4])) From 2958ce143428a9db2a411015d03ac61f9b50e7ef Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Sat, 9 Aug 2014 00:55:39 +0200 Subject: [PATCH 10/10] Categorical: Fix comparison of Categoricals and Series|Categorical|np.array Categorical can only be compared to another Categorical with the same levels and the same ordering or to a scalar value. If the Categorical has no order defined (cat.ordered == False), only equal (and not equal) are defined.
--- doc/source/categorical.rst | 44 ++++++++++++- pandas/core/categorical.py | 44 +++++++++++-- pandas/core/common.py | 13 +++- pandas/core/ops.py | 11 +++- pandas/tests/test_categorical.py | 107 +++++++++++++++++++++++++++++++ 5 files changed, 207 insertions(+), 12 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c47653e92edb3..831093228b5d6 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -332,6 +332,45 @@ Operations The following operations are possible with categorical data: +Comparing `Categoricals` with other objects is possible in two cases: + * comparing a `Categorical` to another `Categorical`, when `levels` and `ordered` are the same, or + * comparing a `Categorical` to a scalar. +All other comparisons will raise a TypeError. + +.. ipython:: python + + cat = pd.Series(pd.Categorical([1,2,3], levels=[3,2,1])) + cat_base = pd.Series(pd.Categorical([2,2,2], levels=[3,2,1])) + cat_base2 = pd.Series(pd.Categorical([2,2,2])) + + cat > cat_base + + # This doesn't work because the levels are not the same + try: + cat > cat_base2 + except TypeError as e: + print("TypeError: " + str(e)) + + cat > 2 + +.. note:: + + Comparisons with `Series`, `np.array` or a `Categorical` with different levels or ordering + will raise a `TypeError` because custom level ordering would result in two valid results: + one taking the ordering into account and one without. If you want to compare a `Categorical` + with such a type, you need to be explicit and convert the `Categorical` to values: + +.. ipython:: python + + base = np.array([1,2,3]) + + try: + cat > base + except TypeError as e: + print("TypeError: " + str(e)) + + np.asarray(cat) > base + Getting the minimum and maximum, if the categorical is ordered: .. ipython:: python @@ -510,7 +549,8 @@ The same applies to ``df.append(df)``.
Getting Data In/Out ------------------- -Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently raise ``NotImplementedError``. +Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently +raise ``NotImplementedError``. Writing to a CSV file will convert the data, effectively removing any information about the `Categorical` (levels and ordering). So if you read back the CSV file you have to convert the @@ -580,7 +620,7 @@ object and not as a low level `numpy` array dtype. This leads to some problems. try: np.dtype("category") except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError: " + str(e)) dtype = pd.Categorical(["a"]).dtype try: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 9aa801ba8336d..91713ab3bc576 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -19,16 +19,36 @@ def _cat_compare_op(op): def f(self, other): - if isinstance(other, (Categorical, np.ndarray)): - values = np.asarray(self) - f = getattr(values, op) - return f(np.asarray(other)) - else: + # On python2, you can usually compare any type to any type, and Categoricals can be + # seen as a custom type, but having different results depending whether a level are + # the same or not is kind of insane, so be a bit stricter here and use the python3 idea + # of comparing only things of equal type. 
+ if not self.ordered: + if op in ['__lt__', '__gt__','__le__','__ge__']: + raise TypeError("Unordered Categoricals can only compare equality or not") + if isinstance(other, Categorical): + # Two Categoricals can only be compared if the levels are the same + if (len(self.levels) != len(other.levels)) or not ((self.levels == other.levels).all()): + raise TypeError("Categoricals can only be compared if 'levels' are the same") + if not (self.ordered == other.ordered): + raise TypeError("Categoricals can only be compared if 'ordered' is the same") + na_mask = (self._codes == -1) | (other._codes == -1) + f = getattr(self._codes, op) + ret = f(other._codes) + if na_mask.any(): + # In other Series comparisons, NaN leads to False, so do that here too + ret[na_mask] = False + return ret + elif np.isscalar(other): if other in self.levels: i = self.levels.get_loc(other) return getattr(self._codes, op)(i) else: return np.repeat(False, len(self)) + else: + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'np.asarray(cat) other'." 
+ raise TypeError(msg.format(op=op,typ=type(other))) f.__name__ = op @@ -172,6 +192,9 @@ class Categorical(PandasObject): Categorical.max """ + # For comparisons, so that numpy uses our implementation if the compare ops, which raise + __array_priority__ = 1000 + def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False): if fastpath: @@ -447,9 +470,16 @@ def __array__(self, dtype=None): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.levels.dtype + A numpy array of either the specified dtype or, if dtype==None (default), the same + dtype as categorical.levels.dtype """ - return com.take_1d(self.levels.values, self._codes) + ret = com.take_1d(self.levels.values, self._codes) + if dtype and dtype != self.levels.dtype: + return np.asarray(ret, dtype) + return ret + + def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): + return np.asarray(self, dtype) @property def T(self): diff --git a/pandas/core/common.py b/pandas/core/common.py index bc4c95ed3323e..9e04e38b9c4e2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -275,7 +275,9 @@ def _isnull_ndarraylike(obj): values = getattr(obj, 'values', obj) dtype = values.dtype - if dtype.kind in ('O', 'S', 'U'): + if is_categorical_dtype(values): + result = _isnull_categorical(values) + elif dtype.kind in ('O', 'S', 'U'): # Working around NumPy ticket 1542 shape = values.shape @@ -285,7 +287,6 @@ def _isnull_ndarraylike(obj): result = np.empty(shape, dtype=bool) vec = lib.isnullobj(values.ravel()) result[...] 
= vec.reshape(shape) - elif dtype in _DATELIKE_DTYPES: # this is the NaT pattern result = values.view('i8') == tslib.iNaT @@ -299,6 +300,14 @@ def _isnull_ndarraylike(obj): return result +def _isnull_categorical(obj): + ret = obj._codes == -1 + # String/object and float levels can hold np.nan + if obj.levels.dtype.kind in ('S', 'O' 'f'): + if np.nan in obj.levels: + nan_pos = np.where(com.isnull(self.levels)) + ret = ret | obj == nan_pos + return ret def _isnull_ndarraylike_old(obj): values = getattr(obj, 'values', obj) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 9f29570af6f4f..de3b8d857617f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -524,6 +524,10 @@ def _comp_method_SERIES(op, name, str_rep, masker=False): code duplication. """ def na_op(x, y): + if com.is_categorical_dtype(x) != com.is_categorical_dtype(y): + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'series np.asarray(cat)'." + raise TypeError(msg.format(op=op,typ=type(y))) if x.dtype == np.object_: if isinstance(y, list): y = lib.list_to_object_array(y) @@ -555,11 +559,16 @@ def wrapper(self, other): index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented - elif isinstance(other, (pa.Array, pd.Series, pd.Index)): + elif isinstance(other, (pa.Array, pd.Index)): if len(self) != len(other): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) + elif isinstance(other, pd.Categorical): + if not com.is_categorical_dtype(self): + msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\ + "If you want to compare values, use 'series np.asarray(other)'." 
+ raise TypeError(msg.format(op=op,typ=self.dtype)) else: mask = isnull(self) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index f5b20e924cdf6..dbfea95bb58c8 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -223,6 +223,62 @@ def test_comparisons(self): expected = np.repeat(False, len(self.factor)) self.assert_numpy_array_equal(result, expected) + # comparisons with categoricals + cat_rev = pd.Categorical(["a","b","c"], levels=["c","b","a"]) + cat_rev_base = pd.Categorical(["b","b","b"], levels=["c","b","a"]) + cat = pd.Categorical(["a","b","c"]) + cat_base = pd.Categorical(["b","b","b"], levels=cat.levels) + + # comparisons need to take level ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = np.array([True, False, False]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = np.array([False, False, True]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = np.array([False, False, True]) + self.assert_numpy_array_equal(res, exp) + + # Only categories with same levels can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + cat_rev_base2 = pd.Categorical(["b","b","b"], levels=["c","b","a","d"]) + def f(): + cat_rev > cat_rev_base2 + self.assertRaises(TypeError, f) + + # Only categories with same ordering information can be compared + cat_unorderd = cat.copy() + cat_unorderd.ordered = False + self.assertFalse((cat > cat).any()) + def f(): + cat > cat_unorderd + self.assertRaises(TypeError, f) + + # comparison (in both directions) with Series will raise + s = Series(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + # comparison with numpy.array will raise in both direction, but only on newer + # numpy versions + 
a = np.array(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + # The following work via '__array_priority__ = 1000' + # but only on numpy > 1.6.1? + tm._skip_if_not_numpy17_friendly() + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + def test_na_flags_int_levels(self): # #1457 @@ -1609,6 +1665,57 @@ def f(): s[1] = np.nan tm.assert_series_equal(s, exp) + def test_comparisons(self): + tests_data = [(list("abc"), list("cba"), list("bbb")), + ([1,2,3], [3,2,1], [2,2,2])] + for data , reverse, base in tests_data: + cat_rev = pd.Series(pd.Categorical(data, levels=reverse)) + cat_rev_base = pd.Series(pd.Categorical(base, levels=reverse)) + cat = pd.Series(pd.Categorical(data)) + cat_base = pd.Series(pd.Categorical(base, levels=cat.cat.levels)) + s = Series(base) + a = np.array(base) + + # comparisons need to take level ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = Series([True, False, False]) + tm.assert_series_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = Series([False, False, True]) + tm.assert_series_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = Series([False, False, True]) + tm.assert_series_equal(res, exp) + + # Only categories with same levels can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + # categorical cannot be compared to Series or numpy array, and also not the other way + # around + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + + # Categoricals can be compared to scalar values + res = cat_rev > 
base[0] + tm.assert_series_equal(res, exp) + + # And test NaN handling... + cat = pd.Series(pd.Categorical(["a","b","c", np.nan])) + exp = Series([True, True, True, False]) + res = (cat == cat) + tm.assert_series_equal(res, exp) def test_concat(self): cat = pd.Categorical(["a","b"], levels=["a","b"])