diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 985f112979a7e..6b1bfdf7b241d 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -66,7 +66,8 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 'D' : np.array([3] * 4,dtype='int32'), - 'E' : 'foo' }) + 'E' : pd.Categorical(["test","train","test","train"]), + 'F' : 'foo' }) df2 Having specific :ref:`dtypes ` @@ -635,6 +636,32 @@ the quarter end: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 ts.head() +Categoricals +------------ + +Since version 0.15, pandas can include categorical data in a ``DataFrame``. For full docs, see the +:ref:`Categorical introduction ` and the :ref:`API documentation ` . + +.. ipython:: python + + df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + + # convert the raw grades to a categorical + df["grade"] = pd.Categorical(df["raw_grade"]) + + # Alternative: df["grade"] = df["raw_grade"].astype("category") + df["grade"] + + # Rename the levels + df["grade"].cat.levels = ["very good", "good", "very bad"] + + # Reorder the levels and simultaneously add the missing levels + df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] + df.sort("grade") + df.groupby("grade").size() + + Plotting -------- diff --git a/doc/source/api.rst b/doc/source/api.rst index feb4da700354d..017739adbc8b1 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -521,51 +521,33 @@ Categorical .. currentmodule:: pandas.core.categorical If the Series is of dtype ``category``, ``Series.cat`` can be used to access the the underlying -``Categorical``. This data type is similar to the otherwise underlying numpy array -and has the following usable methods and properties (all available as -``Series.cat.``). - +``Categorical``. 
This accessor is similar to the ``Series.dt`` or ``Series.str`` accessors and has the +following usable methods and properties (all available as ``Series.cat.``). .. autosummary:: :toctree: generated/ - Categorical - Categorical.from_codes Categorical.levels Categorical.ordered Categorical.reorder_levels Categorical.remove_unused_levels - Categorical.min - Categorical.max - Categorical.mode - Categorical.describe -``np.asarray(categorical)`` works by implementing the array interface. Be aware, that this converts -the Categorical back to a numpy array, so levels and order information is not preserved! +The following methods are considered API when using ``Categorical`` directly: .. autosummary:: :toctree: generated/ - Categorical.__array__ + Categorical + Categorical.from_codes + Categorical.codes -To create compatibility with `pandas.Series` and `numpy` arrays, the following (non-API) methods -are also introduced. +``np.asarray(categorical)`` works by implementing the array interface. Be aware, that this converts +the Categorical back to a numpy array, so levels and order information is not preserved! .. autosummary:: :toctree: generated/ - Categorical.from_array - Categorical.get_values - Categorical.copy - Categorical.dtype - Categorical.ndim - Categorical.sort - Categorical.equals - Categorical.unique - Categorical.order - Categorical.argsort - Categorical.fillna - + Categorical.__array__ Plotting ~~~~~~~~ diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c08351eb87a79..6ed1a7982a64b 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -90,6 +90,7 @@ By using some special functions: df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) +See :ref:`documentation ` for :func:`~pandas.cut`. 
`Categoricals` have a specific ``category`` :ref:`dtype `: @@ -331,6 +332,57 @@ Operations The following operations are possible with categorical data: +Comparing `Categoricals` with other objects is possible in two cases: + + * comparing a `Categorical` to another `Categorical`, when `levels` and `ordered` are the same, or + * comparing a `Categorical` to a scalar. + +All other comparisons will raise a TypeError. + +.. ipython:: python + + cat = pd.Series(pd.Categorical([1,2,3], levels=[3,2,1])) + cat_base = pd.Series(pd.Categorical([2,2,2], levels=[3,2,1])) + cat_base2 = pd.Series(pd.Categorical([2,2,2])) + + cat + cat_base + cat_base2 + +Comparing to a categorical with the same levels and ordering or to a scalar works: + +.. ipython:: python + + cat > cat_base + cat > 2 + +This doesn't work because the levels are not the same: + +.. ipython:: python + + try: + cat > cat_base2 + except TypeError as e: + print("TypeError: " + str(e)) + +.. note:: + + Comparisons with `Series`, `np.array` or a `Categorical` with different levels or ordering + will raise a `TypeError` because custom level ordering would result in two valid results: + one taking the ordering into account and one without. If you want to compare a `Categorical` + with such a type, you need to be explicit and convert the `Categorical` to values: + +.. ipython:: python + + base = np.array([1,2,3]) + + try: + cat > base + except TypeError as e: + print("TypeError: " + str(e)) + + np.asarray(cat) > base + Getting the minimum and maximum, if the categorical is ordered: .. ipython:: python @@ -489,34 +541,38 @@ but the levels of these `Categoricals` need to be the same: .. 
ipython:: python - cat = pd.Categorical(["a","b"], levels=["a","b"]) - vals = [1,2] - df = pd.DataFrame({"cats":cat, "vals":vals}) - res = pd.concat([df,df]) - res - res.dtypes + cat = pd.Categorical(["a","b"], levels=["a","b"]) + vals = [1,2] + df = pd.DataFrame({"cats":cat, "vals":vals}) + res = pd.concat([df,df]) + res + res.dtypes - df_different = df.copy() - df_different["cats"].cat.levels = ["a","b","c"] +In this case the levels are not the same and so an error is raised: - try: - pd.concat([df,df]) - except ValueError as e: - print("ValueError: " + str(e)) +.. ipython:: python + + df_different = df.copy() + df_different["cats"].cat.levels = ["a","b","c"] + try: + pd.concat([df,df_different]) + except ValueError as e: + print("ValueError: " + str(e)) The same applies to ``df.append(df)``. Getting Data In/Out ------------------- -Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently raise ``NotImplementedError``. +Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently +raise ``NotImplementedError``. Writing to a CSV file will convert the data, effectively removing any information about the `Categorical` (levels and ordering). So if you read back the CSV file you have to convert the relevant columns back to `category` and assign the right levels and level ordering. .. ipython:: python - :suppress: + :suppress: from pandas.compat import StringIO @@ -548,7 +604,7 @@ default not included in computations. See the :ref:`Missing Data section ` There are two ways a `np.nan` can be represented in `Categorical`: either the value is not -available or `np.nan` is a valid level. +available ("missing value") or `np.nan` is a valid level. .. ipython:: python @@ -560,9 +616,25 @@ available or `np.nan` is a valid level. 
s2.cat.levels = [1,2,np.nan] s2 # three levels, np.nan included - # Note: as int arrays can't hold NaN the levels were converted to float + # Note: as int arrays can't hold NaN the levels were converted to object s2.cat.levels +.. note:: + Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as + `np.nan` levels into account: + +.. ipython:: python + + c = pd.Categorical(["a","b",np.nan]) + c.levels = ["a","b",np.nan] + # will be inserted as a NA level: + c[0] = np.nan + s = pd.Series(c) + s + pd.isnull(s) + s.fillna("a") + + Gotchas ------- @@ -579,7 +651,7 @@ object and not as a low level `numpy` array dtype. This leads to some problems. try: np.dtype("category") except TypeError as e: - print("TypeError: " + str(e)) + print("TypeError: " + str(e)) dtype = pd.Categorical(["a"]).dtype try: @@ -587,7 +659,10 @@ object and not as a low level `numpy` array dtype. This leads to some problems. except TypeError as e: print("TypeError: " + str(e)) - # dtype comparisons work: +Dtype comparisons work: + +.. ipython:: python + dtype == np.str_ np.str_ == dtype diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 8d718bacd262b..bcdb9ada15bb3 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -505,3 +505,10 @@ handling of NaN: pd.factorize(x, sort=True) np.unique(x, return_inverse=True)[::-1] + +.. note:: + If you just want to handle one column as a categorical variable (like R's factor), + you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or + ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`, + see the :ref:`Categorical introduction ` and the + :ref:`API documentation `. This feature was introduced in version 0.15. 
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 0223a11d8a011..1186c6e4ade39 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -283,9 +283,10 @@ Categoricals in Series/DataFrame :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, -:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`). +:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`). -For full docs, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. +For full docs, see the :ref:`Categorical introduction ` and the +:ref:`API documentation `. .. ipython:: python diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c9674aea4a715..28cbdc0d5634e 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2,12 +2,13 @@ import numpy as np from warnings import warn +import types from pandas import compat from pandas.compat import u -from pandas.core.algorithms import factorize, unique -from pandas.core.base import PandasObject +from pandas.core.algorithms import factorize +from pandas.core.base import PandasObject, PandasDelegate from pandas.core.index import Index, _ensure_index from pandas.core.indexing import _is_null_slice from pandas.tseries.period import PeriodIndex @@ -18,16 +19,36 @@ def _cat_compare_op(op): def f(self, other): - if isinstance(other, (Categorical, np.ndarray)): - values = np.asarray(self) - f = getattr(values, op) - return f(np.asarray(other)) - else: + # On python2, you can usually compare any type to any type, and Categoricals can be + # seen as a custom type, but having different results depending whether a level are + # the same or not is kind of insane, so be a bit stricter here and use the python3 idea + # of comparing only things of 
equal type. + if not self.ordered: + if op in ['__lt__', '__gt__','__le__','__ge__']: + raise TypeError("Unordered Categoricals can only compare equality or not") + if isinstance(other, Categorical): + # Two Categoricals can only be be compared if the levels are the same + if (len(self.levels) != len(other.levels)) or not ((self.levels == other.levels).all()): + raise TypeError("Categoricals can only be compared if 'levels' are the same") + if not (self.ordered == other.ordered): + raise TypeError("Categoricals can only be compared if 'ordered' is the same") + na_mask = (self._codes == -1) | (other._codes == -1) + f = getattr(self._codes, op) + ret = f(other._codes) + if na_mask.any(): + # In other series, the leads to False, so do that here too + ret[na_mask] = False + return ret + elif np.isscalar(other): if other in self.levels: i = self.levels.get_loc(other) return getattr(self._codes, op)(i) else: return np.repeat(False, len(self)) + else: + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'np.asarray(cat) other'." 
+ raise TypeError(msg.format(op=op,typ=type(other))) f.__name__ = op @@ -109,9 +130,9 @@ class Categorical(PandasObject): Attributes ---------- - levels : ndarray + levels : Index The levels of this categorical - codes : Index + codes : ndarray The codes (integer positions, which point to the levels) of this categorical, read only ordered : boolean Whether or not this Categorical is ordered @@ -171,6 +192,9 @@ class Categorical(PandasObject): Categorical.max """ + # For comparisons, so that numpy uses our implementation if the compare ops, which raise + __array_priority__ = 1000 + def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False): if fastpath: @@ -206,10 +230,15 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, # which is fine, but since factorize does this correctly no need here # this is an issue because _sanitize_array also coerces np.nan to a string # under certain versions of numpy as well - inferred = com._possibly_infer_to_datetimelike(values) - if not isinstance(inferred, np.ndarray): + values = com._possibly_infer_to_datetimelike(values) + if not isinstance(values, np.ndarray): + values = _convert_to_list_like(values) from pandas.core.series import _sanitize_array - values = _sanitize_array(values, None) + # On list with NaNs, int values will be converted to float. Use "object" dtype + # to prevent this. In the end objects will be casted to int/... in the level + # assignment step. + dtype = 'object' if com.isnull(values).any() else None + values = _sanitize_array(values, None, dtype=dtype) if levels is None: try: @@ -277,7 +306,7 @@ def from_array(cls, data): return Categorical(data) @classmethod - def from_codes(cls, codes, levels, ordered=True, name=None): + def from_codes(cls, codes, levels, ordered=False, name=None): """ Make a Categorical type from codes and levels arrays. 
@@ -294,7 +323,7 @@ def from_codes(cls, codes, levels, ordered=True, name=None): The levels for the categorical. Items need to be unique. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, - the resulting categorical will be ordered. + the resulting categorical will be unordered. name : str, optional Name for the Categorical variable. """ @@ -333,12 +362,34 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) + def _get_labels(self): + """ Get the level labels (deprecated). + + Deprecated, use .codes! + """ + import warnings + warnings.warn("'labels' is deprecated. Use 'codes' instead", FutureWarning) + return self.codes + + labels = property(fget=_get_labels, fset=_set_codes) + _levels = None @classmethod def _validate_levels(cls, levels): """" Validates that we have good levels """ - levels = _ensure_index(levels) + if not isinstance(levels, Index): + dtype = None + if not hasattr(levels, "dtype"): + levels = _convert_to_list_like(levels) + # on levels with NaNs, int values would be converted to float. Use "object" dtype + # to prevent this. 
+ if com.isnull(levels).any(): + without_na = np.array([x for x in levels if com.notnull(x)]) + with_na = np.array(levels) + if with_na.dtype != without_na.dtype: + dtype = "object" + levels = Index(levels, dtype=dtype) if not levels.is_unique: raise ValueError('Categorical levels must be unique') return levels @@ -429,14 +480,61 @@ def __array__(self, dtype=None): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.levels.dtype + A numpy array of either the specified dtype or, if dtype==None (default), the same + dtype as categorical.levels.dtype """ - return com.take_1d(self.levels.values, self._codes) + ret = com.take_1d(self.levels.values, self._codes) + if dtype and dtype != self.levels.dtype: + return np.asarray(ret, dtype) + return ret @property def T(self): return self + def isnull(self): + """ + Detect missing values + + Both missing values (-1 in .codes) and NA as a level are detected. + + Returns + ------- + a boolean array of whether my values are null + + See also + -------- + pandas.isnull : pandas version + Categorical.notnull : boolean inverse of Categorical.isnull + """ + + ret = self._codes == -1 + + # String/object and float levels can hold np.nan + if self.levels.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.levels: + nan_pos = np.where(com.isnull(self.levels)) + # we only have one NA in levels + ret = np.logical_or(ret , self._codes == nan_pos[0]) + return ret + + def notnull(self): + """ + Reverse of isnull + + Both missing values (-1 in .codes) and NA as a level are detected as null. + + Returns + ------- + a boolean array of whether my values are not null + + See also + -------- + pandas.notnull : pandas version + Categorical.isnull : boolean inverse of Categorical.notnull + """ + return ~self.isnull() + def get_values(self): """ Return the values. 
@@ -503,10 +601,27 @@ def order(self, inplace=False, ascending=True, na_position='last', **kwargs): if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) - codes = np.sort(self._codes.copy()) + codes = np.sort(self._codes) if not ascending: codes = codes[::-1] + # NaN handling + na_mask = (codes==-1) + if na_mask.any(): + n_nans = len(codes[na_mask]) + if na_position=="first" and not ascending: + # in this case sort to the front + new_codes = codes.copy() + new_codes[0:n_nans] = -1 + new_codes[n_nans:] = codes[~na_mask] + codes = new_codes + elif na_position=="last" and not ascending: + # ... and to the end + new_codes = codes.copy() + pos = len(codes)-n_nans + new_codes[0:pos] = codes[~na_mask] + new_codes[pos:] = -1 + codes = new_codes if inplace: self._codes = codes return @@ -595,6 +710,15 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): values = self._codes + # Make sure that we also get NA in levels + if self.levels.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.levels: + values = values.copy() + nan_pos = np.where(com.isnull(self.levels)) + # we only have one NA in levels + values[values == nan_pos[0]] = -1 + + # pad / bfill if method is not None: @@ -608,9 +732,9 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs): if not com.isnull(fill_value) and fill_value not in self.levels: raise ValueError("fill value must be in levels") - mask = self._codes==-1 + mask = values==-1 if mask.any(): - values = self._codes.copy() + values = values.copy() values[mask] = self.levels.get_loc(fill_value) return Categorical(values, levels=self.levels, ordered=self.ordered, @@ -760,7 +884,8 @@ def __setitem__(self, key, value): rvalue = value if com.is_list_like(value) else [value] to_add = Index(rvalue)-self.levels - if len(to_add): + # no assignments of values not in levels, but it's always ok to set something to np.nan + if len(to_add) and not com.isnull(to_add).all(): raise 
ValueError("cannot setitem on a Categorical with a new level," " set the levels first") @@ -768,9 +893,8 @@ def __setitem__(self, key, value): if isinstance(key, (int, np.integer)): pass - # tuple of indexers + # tuple of indexers (dataframe) elif isinstance(key, tuple): - # only allow 1 dimensional slicing, but can # in a 2-d case be passd (slice(None),....) if len(key) == 2: @@ -782,10 +906,28 @@ def __setitem__(self, key, value): else: raise AssertionError("invalid slicing for a 1-ndim categorical") + # slicing in Series or Categorical + elif isinstance(key, slice): + pass + + # Array of True/False in Series or Categorical else: - key = self._codes[key] + # There is a bug in numpy, which does not accept a Series as a indexer + # https://github.com/pydata/pandas/issues/6168 + # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9 + # FIXME: remove when numpy 1.9 is the lowest numpy version pandas accepts... + key = np.asarray(key) lindexer = self.levels.get_indexer(rvalue) + + # FIXME: the following can be removed after https://github.com/pydata/pandas/issues/7820 + # is fixed. 
+ # float levels do currently return -1 for np.nan, even if np.nan is included in the index + # "repair" this here + if com.isnull(rvalue).any() and com.isnull(self.levels).any(): + nan_pos = np.where(com.isnull(self.levels)) + lindexer[lindexer == -1] = nan_pos + self._codes[key] = lindexer #### reduction ops #### @@ -916,16 +1058,67 @@ def describe(self): 'values' : self._codes } ).groupby('codes').count() - counts.index = self.levels.take(counts.index) - counts = counts.reindex(self.levels) freqs = counts / float(counts.sum()) from pandas.tools.merge import concat result = concat([counts,freqs],axis=1) - result.index.name = 'levels' result.columns = ['counts','freqs'] + + # fill in the real levels + check = result.index == -1 + if check.any(): + # Sort -1 (=NaN) to the last position + index = np.arange(0, len(self.levels)+1) + index[-1] = -1 + result = result.reindex(index) + # build new index + levels = np.arange(0,len(self.levels)+1 ,dtype=object) + levels[:-1] = self.levels + levels[-1] = np.nan + result.index = levels.take(result.index) + else: + result.index = self.levels.take(result.index) + result = result.reindex(self.levels) + result.index.name = 'levels' + return result +##### The Series.cat accessor ##### + +class CategoricalProperties(PandasDelegate): + """ + Accessor object for categorical properties of the Series values. 
+ + Examples + -------- + >>> s.cat.levels + >>> s.cat.levels = list('abc') + >>> s.cat.reorder_levels(list('cab')) + + Allows accessing to specific getter and access methods + """ + + def __init__(self, values, index): + self.categorical = values + self.index = index + + def _delegate_property_get(self, name): + return getattr(self.categorical, name) + + def _delegate_property_set(self, name, new_values): + return setattr(self.categorical, name, new_values) + + def _delegate_method(self, name, *args, **kwargs): + method = getattr(self.categorical, name) + return method(*args, **kwargs) + +CategoricalProperties._add_delegate_accessors(delegate=Categorical, + accessors=["levels", "ordered"], + typ='property') +CategoricalProperties._add_delegate_accessors(delegate=Categorical, + accessors=["reorder_levels", "remove_unused_levels"], + typ='method') + ##### utility routines ##### def _get_codes_for_values(values, levels): @@ -942,3 +1135,17 @@ def _get_codes_for_values(values, levels): t.map_locations(com._values_from_object(levels)) return com._ensure_platform_int(t.lookup(values)) +def _convert_to_list_like(list_like): + if hasattr(list_like, "dtype"): + return list_like + if isinstance(list_like, list): + return list_like + if (com._is_sequence(list_like) or isinstance(list_like, tuple) + or isinstance(list_like, types.GeneratorType)): + return list(list_like) + elif np.isscalar(list_like): + return [list_like] + else: + # is this reached? 
+ return [list_like] + diff --git a/pandas/core/common.py b/pandas/core/common.py index 48fb75f59ac34..1fc0cd4101cf9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -276,15 +276,22 @@ def _isnull_ndarraylike(obj): dtype = values.dtype if dtype.kind in ('O', 'S', 'U'): - # Working around NumPy ticket 1542 - shape = values.shape - - if dtype.kind in ('S', 'U'): - result = np.zeros(values.shape, dtype=bool) + if is_categorical_dtype(values): + from pandas import Categorical + if not isinstance(values, Categorical): + values = values.values + result = values.isnull() else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj(values.ravel()) - result[...] = vec.reshape(shape) + + # Working around NumPy ticket 1542 + shape = values.shape + + if dtype.kind in ('S', 'U'): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] = vec.reshape(shape) elif dtype in _DATELIKE_DTYPES: # this is the NaT pattern @@ -299,7 +306,6 @@ def _isnull_ndarraylike(obj): return result - def _isnull_ndarraylike_old(obj): values = getattr(obj, 'values', obj) dtype = values.dtype @@ -2440,7 +2446,7 @@ def _get_callable_name(obj): # instead of the empty string in this case to allow # distinguishing between no name and a name of '' return None - + _string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, compat.text_type))) diff --git a/pandas/core/format.py b/pandas/core/format.py index 8f749d07296a7..0539d803a42a4 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -177,7 +177,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done for Categoricals # Only added when we request a name if self.name and com.is_categorical_dtype(self.series.dtype): - level_info = self.series.cat._repr_level_info() + level_info = self.series.values._repr_level_info() if footer: footer += "\n" footer += level_info diff --git 
a/pandas/core/internals.py b/pandas/core/internals.py index f3b8a54034d56..44ca001b65296 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1628,6 +1628,27 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) + def putmask(self, mask, new, align=True, inplace=False): + """ putmask the data to the block; it is possible that we may create a + new dtype of block + + return the resulting block(s) + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block(s), the result of the putmask + """ + new_values = self.values if inplace else self.values.copy() + new_values[mask] = new + return [self.make_block_same_class(values=new_values, placement=self.mgr_locs)] + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, klass=None): """ diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 16e6e40802a95..fc51511ff3970 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -522,6 +522,10 @@ def _comp_method_SERIES(op, name, str_rep, masker=False): code duplication. """ def na_op(x, y): + if com.is_categorical_dtype(x) != com.is_categorical_dtype(y): + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'series np.asarray(cat)'." 
+ raise TypeError(msg.format(op=op,typ=type(y))) if x.dtype == np.object_: if isinstance(y, list): y = lib.list_to_object_array(y) @@ -553,11 +557,16 @@ def wrapper(self, other): index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented - elif isinstance(other, (pa.Array, pd.Series, pd.Index)): + elif isinstance(other, (pa.Array, pd.Index)): if len(self) != len(other): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) + elif isinstance(other, pd.Categorical): + if not com.is_categorical_dtype(self): + msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\ + "If you want to compare values, use 'series np.asarray(other)'." + raise TypeError(msg.format(op=op,typ=self.dtype)) else: mask = isnull(self) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f0e651bfc5b1..68f5b4d36392f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -906,7 +906,7 @@ def _repr_footer(self): # Categorical if com.is_categorical_dtype(self.dtype): - level_info = self.cat._repr_level_info() + level_info = self.values._repr_level_info() return u('%sLength: %d, dtype: %s\n%s') % (namestr, len(self), str(self.dtype.name), @@ -2390,11 +2390,12 @@ def dt(self): #------------------------------------------------------------------------------ # Categorical methods - @property + @cache_readonly def cat(self): + from pandas.core.categorical import CategoricalProperties if not com.is_categorical_dtype(self.dtype): raise TypeError("Can only use .cat accessor with a 'category' dtype") - return self.values + return CategoricalProperties(self.values, self.index) Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d07adeadb640c..fcfee8cf9b1ba 100644 --- a/pandas/tests/test_categorical.py +++ 
b/pandas/tests/test_categorical.py @@ -3,6 +3,7 @@ from datetime import datetime from pandas.compat import range, lrange, u import re +from distutils.version import LooseVersion import numpy as np import pandas as pd @@ -70,6 +71,18 @@ def test_constructor(self): c2 = Categorical(exp_arr, levels=["c","b","a"]) self.assert_numpy_array_equal(c2.__array__(), exp_arr) + # levels must be unique + def f(): + Categorical([1,2], [1,2,2]) + self.assertRaises(ValueError, f) + def f(): + Categorical(["a","b"], ["a","b","b"]) + self.assertRaises(ValueError, f) + def f(): + Categorical([1,2], [1,2,np.nan, np.nan]) + self.assertRaises(ValueError, f) + + # Categorical as input c1 = Categorical(["a", "b", "c", "a"]) c2 = Categorical(c1) @@ -110,6 +123,79 @@ def test_constructor(self): cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) self.assertTrue(com.is_integer_dtype(cat.levels)) + # https://github.com/pydata/pandas/issues/3678 + cat = pd.Categorical([np.nan,1, 2, 3]) + self.assertTrue(com.is_integer_dtype(cat.levels)) + + # this should result in floats + cat = pd.Categorical([np.nan, 1, 2., 3 ]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + cat = pd.Categorical([np.nan, 1., 2., 3. ]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + # preserve int as far as possible by converting to object if NaN is in levels + cat = pd.Categorical([np.nan, 1, 2, 3], levels=[np.nan, 1, 2, 3]) + self.assertTrue(com.is_object_dtype(cat.levels)) + # This doesn't work -> this would probably need some kind of "remember the original type" + # feature to try to cast the array interface result to... 
+ #vals = np.asarray(cat[cat.notnull()]) + #self.assertTrue(com.is_integer_dtype(vals)) + cat = pd.Categorical([np.nan,"a", "b", "c"], levels=[np.nan,"a", "b", "c"]) + self.assertTrue(com.is_object_dtype(cat.levels)) + # but don't do it for floats + cat = pd.Categorical([np.nan, 1., 2., 3.], levels=[np.nan, 1., 2., 3.]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + + # corner cases + cat = pd.Categorical([1]) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + cat = pd.Categorical(["a"]) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == "a") + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + # Scalars should be converted to lists + cat = pd.Categorical(1) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + cat = pd.Categorical([1], levels=1) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + def test_constructor_with_generator(self): + # This was raising an Error in isnull(single_val).any() because isnull returned a scalar + # for a generator + from pandas.compat import range as xrange + + exp = Categorical([0,1,2]) + cat = Categorical((x for x in [0,1,2])) + self.assertTrue(cat.equals(exp)) + cat = Categorical(xrange(3)) + self.assertTrue(cat.equals(exp)) + + # This uses xrange internally + from pandas.core.index import MultiIndex + MultiIndex.from_product([range(5), ['a', 'b', 'c']]) + + # check that levels accept generators and sequences + cat = pd.Categorical([0,1,2], levels=(x for x in [0,1,2])) + self.assertTrue(cat.equals(exp)) + cat = pd.Categorical([0,1,2], levels=xrange(3)) + self.assertTrue(cat.equals(exp)) + + def test_from_codes(self): # too few levels @@ -133,7 +219,7 @@ def 
f(): self.assertRaises(ValueError, f) - exp = Categorical(["a","b","c"]) + exp = Categorical(["a","b","c"], ordered=False) res = Categorical.from_codes([0,1,2], ["a","b","c"]) self.assertTrue(exp.equals(res)) @@ -178,6 +264,62 @@ def test_comparisons(self): expected = np.repeat(False, len(self.factor)) self.assert_numpy_array_equal(result, expected) + # comparisons with categoricals + cat_rev = pd.Categorical(["a","b","c"], levels=["c","b","a"]) + cat_rev_base = pd.Categorical(["b","b","b"], levels=["c","b","a"]) + cat = pd.Categorical(["a","b","c"]) + cat_base = pd.Categorical(["b","b","b"], levels=cat.levels) + + # comparisons need to take level ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = np.array([True, False, False]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = np.array([False, False, True]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = np.array([False, False, True]) + self.assert_numpy_array_equal(res, exp) + + # Only categories with same levels can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + cat_rev_base2 = pd.Categorical(["b","b","b"], levels=["c","b","a","d"]) + def f(): + cat_rev > cat_rev_base2 + self.assertRaises(TypeError, f) + + # Only categories with same ordering information can be compared + cat_unorderd = cat.copy() + cat_unorderd.ordered = False + self.assertFalse((cat > cat).any()) + def f(): + cat > cat_unorderd + self.assertRaises(TypeError, f) + + # comparison (in both directions) with Series will raise + s = Series(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + # comparison with numpy.array will raise in both direction, but only on newer + # numpy versions + a = np.array(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat 
> a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + # The following work via '__array_priority__ = 1000' + # works only on numpy >= 1.7.1 and not on PY3.2 + if LooseVersion(np.__version__) > "1.7.1" and not compat.PY3_2: + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + def test_na_flags_int_levels(self): # #1457 @@ -204,6 +346,16 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # check unused levels + cat = self.factor.copy() + cat.levels = ["a","b","c","d"] + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan], + freqs=[3/8., 2/8., 3/8., np.nan], + levels=['a', 'b', 'c', 'd']) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + # check an integer one desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe() expected = DataFrame.from_dict(dict(counts=[5, 3, 3], @@ -213,6 +365,47 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # https://github.com/pydata/pandas/issues/3678 + # describe should work with NaN + cat = pd.Categorical([np.nan,1, 2, 2]) + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, 1], + freqs=[1/4., 2/4., 1/4.], + levels=[1,2,np.nan] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + # having NaN as level and as "not available" should also print two NaNs in describe! 
+ cat = pd.Categorical([np.nan,1, 2, 2]) + cat.levels = [1,2,np.nan] + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1], + freqs=[1/4., 2/4., np.nan, 1/4.], + levels=[1,2,np.nan,np.nan] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + # empty levels show up as NA + cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + result = cat.describe() + + expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], + columns=['counts','freqs'], + index=Index(['a','b','c'],name='levels')) + tm.assert_frame_equal(result,expected) + + # NA as a level + cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) + result = cat.describe() + + expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], + columns=['counts','freqs'], + index=Index(['b','a','c',np.nan],name='levels')) + tm.assert_frame_equal(result,expected) + + def test_print(self): expected = [" a", " b", " b", " a", " a", " c", " c", " c", "Levels (3, object): [a < b < c]"] @@ -361,6 +554,23 @@ def test_nan_handling(self): self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_)) self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + def test_isnull(self): + exp = np.array([False, False, True]) + c = Categorical(["a","b",np.nan]) + res = c.isnull() + self.assert_numpy_array_equal(res, exp) + + c = Categorical(["a","b",np.nan], levels=["a","b",np.nan]) + res = c.isnull() + self.assert_numpy_array_equal(res, exp) + + exp = np.array([True, False, True]) + c = Categorical(["a","b",np.nan]) + c.levels = ["a","b",np.nan] + c[0] = np.nan + res = c.isnull() + self.assert_numpy_array_equal(res, exp) + def test_codes_immutable(self): # Codes should be read only @@ -492,6 +702,54 @@ def test_slicing_directly(self): self.assert_numpy_array_equal(sliced._codes, expected._codes) tm.assert_index_equal(sliced.levels, expected.levels) + def test_set_item_nan(self): + cat = pd.Categorical([1,2,3]) + 
exp = pd.Categorical([1,np.nan,3], levels=[1,2,3]) + cat[1] = np.nan + self.assertTrue(cat.equals(exp)) + + # if nan in levels, the proper code should be set! + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1] = np.nan + exp = np.array([0,3,2,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = np.nan + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = [np.nan, 1] + exp = np.array([0,3,0,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = [np.nan, np.nan] + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2, np.nan, 3], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[pd.isnull(cat)] = np.nan + exp = np.array([0,1,3,2]) + self.assert_numpy_array_equal(cat.codes, exp) + + def test_deprecated_labels(self): + # labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + exp = cat.codes + with tm.assert_produces_warning(FutureWarning): + res = cat.labels + self.assert_numpy_array_equal(res, exp) + self.assertFalse(LooseVersion(pd.__version__) >= '0.18') + + + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True @@ -592,7 +850,7 @@ def test_creation_astype(self): def test_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either the series or the - # categorical should not change the values in the other one! + # categorical should not change the values in the other one, IF you specify copy! 
cat = Categorical(["a","b","c","a"]) s = pd.Series(cat, copy=True) self.assertFalse(s.cat is cat) @@ -612,7 +870,7 @@ def test_sideeffects_free(self): # so this WILL change values cat = Categorical(["a","b","c","a"]) s = pd.Series(cat) - self.assertTrue(s.cat is cat) + self.assertTrue(s.values is cat) s.cat.levels = [1,2,3] exp_s = np.array([1,2,3,1]) self.assert_numpy_array_equal(s.__array__(), exp_s) @@ -628,20 +886,32 @@ def test_nan_handling(self): # Nans are represented as -1 in labels s = Series(Categorical(["a","b",np.nan,"a"])) self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) - self.assert_numpy_array_equal(s.cat._codes, np.array([0,1,-1,0])) + self.assert_numpy_array_equal(s.values.codes, np.array([0,1,-1,0])) # If levels have nan included, the label should point to that instead s2 = Series(Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])) self.assert_numpy_array_equal(s2.cat.levels, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s2.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s2.values.codes, np.array([0,1,2,0])) # Changing levels should also make the replaced level np.nan s3 = Series(Categorical(["a","b","c","a"])) s3.cat.levels = ["a","b",np.nan] self.assert_numpy_array_equal(s3.cat.levels, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s3.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s3.values.codes, np.array([0,1,2,0])) + + def test_cat_accessor(self): + s = Series(Categorical(["a","b",np.nan,"a"])) + self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) + self.assertEqual(s.cat.ordered, True) + exp = Categorical(["a","b",np.nan,"a"], levels=["b","a"]) + s.cat.reorder_levels(["b", "a"]) + self.assertTrue(s.values.equals(exp)) + exp = Categorical(["a","b",np.nan,"a"], levels=["b","a"]) + s[:] = "a" + s.cat.remove_unused_levels() + self.assert_numpy_array_equal(s.cat.levels, np.array(["a"])) def 
test_sequence_like(self): @@ -651,8 +921,8 @@ def test_sequence_like(self): df['grade'] = Categorical(df['raw_grade']) # basic sequencing testing - result = list(df.grade.cat) - expected = np.array(df.grade.cat).tolist() + result = list(df.grade.values) + expected = np.array(df.grade.values).tolist() tm.assert_almost_equal(result,expected) # iteration @@ -694,7 +964,7 @@ def test_series_delegations(self): exp_values = np.array(["a","b","c","a"]) s.cat.reorder_levels(["c","b","a"]) self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused levels @@ -703,7 +973,7 @@ def test_series_delegations(self): exp_values = np.array(["a","b","b","a"]) s.cat.remove_unused_levels() self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error on wrong inputs: @@ -749,10 +1019,6 @@ def test_assignment_to_dataframe(self): s.name = 'E' self.assertTrue(result2.sort_index().equals(s)) - # FIXME? - #### what does this compare to? 
### - result = df.sort_index() - cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) df = pd.DataFrame(pd.Series(cat)) @@ -762,31 +1028,16 @@ def test_describe(self): result = self.cat.describe() self.assertEquals(len(result.columns),1) - # empty levels show up as NA - s = Series(Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True)) - result = s.cat.describe() - expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], - columns=['counts','freqs'], - index=Index(['a','b','c'],name='levels')) - tm.assert_frame_equal(result,expected) + # In a frame, describe() for the cat should be the same as for string arrays (count, unique, + # top, freq) + cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + s = Series(cat) result = s.describe() expected = Series([4,2,"b",3],index=['count','unique','top', 'freq']) tm.assert_series_equal(result,expected) - # NA as a level - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) - result = cat.describe() - - expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], - columns=['counts','freqs'], - index=Index(['b','a','c',np.nan],name='levels')) - tm.assert_frame_equal(result,expected) - - - # In a frame, describe() for the cat should be the same as for string arrays (count, unique, - # top, freq) cat = pd.Series(pd.Categorical(["a","b","c","c"])) df3 = pd.DataFrame({"cat":cat, "s":["a","b","c","c"]}) res = df3.describe() @@ -966,7 +1217,7 @@ def test_sort(self): # Cats must be sorted in a dataframe res = df.sort(columns=["string"], ascending=False) exp = np.array(["d", "c", "b", "a"]) - self.assert_numpy_array_equal(res["sort"].cat.__array__(), exp) + self.assert_numpy_array_equal(res["sort"].values.__array__(), exp) self.assertEqual(res["sort"].dtype, "category") res = df.sort(columns=["sort"], ascending=False) @@ -1009,17 +1260,29 @@ def f(): res = cat.order(ascending=False, na_position='last') exp_val = np.array(["d","c","b","a", np.nan],dtype=object) exp_levels = 
np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='first') exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='last') + exp_val = np.array(["d","c","b","a",np.nan],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) def test_slicing(self): cat = Series(Categorical([1,2,3,4])) @@ -1253,6 +1516,12 @@ def test_assigning_ops(self): df.iloc[2,0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.iloc[df.index == "j",0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current level set def f(): df = orig.copy() @@ -1310,6 +1579,10 @@ def f(): df.loc["j","cats"] = "b" 
tm.assert_frame_equal(df, exp_single_cats_value) + df = orig.copy() + df.loc[df.index == "j","cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + # - assign a single value not in the current level set def f(): df = orig.copy() @@ -1367,6 +1640,10 @@ def f(): df.ix["j",0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) + df = orig.copy() + df.ix[df.index == "j",0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + # - assign a single value not in the current level set def f(): df = orig.copy() @@ -1469,6 +1746,92 @@ def f(): df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) tm.assert_frame_equal(df, exp) + ######### Series ########## + orig = Series(pd.Categorical(["b","b"], levels=["a","b"])) + s = orig.copy() + s[:] = "a" + exp = Series(pd.Categorical(["a","a"], levels=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[1] = "a" + exp = Series(pd.Categorical(["b","a"], levels=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[s.index > 0] = "a" + exp = Series(pd.Categorical(["b","a"], levels=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[[False, True]] = "a" + exp = Series(pd.Categorical(["b","a"], levels=["a","b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.index = ["x", "y"] + s["y"] = "a" + exp = Series(pd.Categorical(["b","a"], levels=["a","b"]), index=["x", "y"]) + tm.assert_series_equal(s, exp) + + # ensure that one can set something to np.nan + s = Series(Categorical([1,2,3])) + exp = Series(Categorical([1,np.nan,3])) + s[1] = np.nan + tm.assert_series_equal(s, exp) + + + def test_comparisons(self): + tests_data = [(list("abc"), list("cba"), list("bbb")), + ([1,2,3], [3,2,1], [2,2,2])] + for data , reverse, base in tests_data: + cat_rev = pd.Series(pd.Categorical(data, levels=reverse)) + cat_rev_base = pd.Series(pd.Categorical(base, levels=reverse)) + cat = pd.Series(pd.Categorical(data)) + cat_base = pd.Series(pd.Categorical(base, 
levels=cat.cat.levels)) + s = Series(base) + a = np.array(base) + + # comparisons need to take level ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = Series([True, False, False]) + tm.assert_series_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = Series([False, False, True]) + tm.assert_series_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = Series([False, False, True]) + tm.assert_series_equal(res, exp) + + # Only categories with same levels can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + # categorical cannot be compared to Series or numpy array, and also not the other way + # around + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + + # Categoricals can be compared to scalar values + res = cat_rev > base[0] + tm.assert_series_equal(res, exp) + + # And test NaN handling... 
+ cat = pd.Series(pd.Categorical(["a","b","c", np.nan])) + exp = Series([True, True, True, False]) + res = (cat == cat) + tm.assert_series_equal(res, exp) def test_concat(self): cat = pd.Categorical(["a","b"], levels=["a","b"]) @@ -1558,6 +1921,16 @@ def f(): res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) + # make sure that fillna takes both missing values and NA levels into account + c = Categorical(["a","b",np.nan]) + c.levels = ["a","b",np.nan] + c[0] = np.nan + df = pd.DataFrame({"cats":c, "vals":[1,2,3]}) + df_exp = pd.DataFrame({"cats": Categorical(["a","b","a"]), "vals": [1,2,3]}) + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + def test_astype_to_other(self): s = self.cat['value_group'] @@ -1607,6 +1980,18 @@ def test_numeric_like_ops(self): # invalid ufunc self.assertRaises(TypeError, lambda : np.log(s)) + def test_cat_tab_completition(self): + # test the tab completion display + ok_for_cat = ['levels','ordered','reorder_levels','remove_unused_levels'] + def get_dir(s): + results = [ r for r in s.cat.__dir__() if not r.startswith('_') ] + return list(sorted(set(results))) + + s = Series(list('aabbcde')).astype('category') + results = get_dir(s) + tm.assert_almost_equal(results,list(sorted(set(ok_for_cat)))) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],