diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7f07187e34c78..2788c91719eab 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -390,6 +390,47 @@ upon serialization. (Related issue :issue:`12997`) # Roundtripping now works pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index +.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical: + +DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` would incorrectly drop non-observed categories of non-grouping columns (:issue:`46357`). + +.. code-block:: ipython + + In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2] + In [7]: df + Out[7]: + 0 + 0 a + 1 b + +*Old Behavior* + +.. code-block:: ipython + + In [8]: df.groupby(level=0, observed=True).value_counts() + Out[8]: + 0 a 1 + 1 b 1 + dtype: int64 + + +*New Behavior* + +.. code-block:: ipython + + In [9]: df.groupby(level=0, observed=True).value_counts() + Out[9]: + 0 a 1 + 1 a 0 + b 1 + 0 b 0 + c 0 + 1 c 0 + dtype: int64 + .. --------------------------------------------------------------------------- .. 
_whatsnew_150.api_breaking: @@ -814,7 +855,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`) - Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) -- +- Bug in :func:`concat` when concatenating two (or more) unordered ``CategoricalIndex`` objects whose categories are permutations of each other, yielding incorrect index values (:issue:`24845`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c1ae3cb1b16ea..a39a8105bed71 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -572,7 +572,9 @@ def map(self, mapper): def _concat(self, to_concat: list[Index], name: Hashable) -> Index: # if calling index is category, don't check dtype of others try: - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + cat = Categorical._concat_same_type( + [self._is_dtype_compat(c) for c in to_concat] + ) except TypeError: # not all to_concat elements are among our categories (or NA) from pandas.core.dtypes.concat import concat_compat @@ -580,5 +582,4 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: res = concat_compat([x._values for x in to_concat]) return Index(res, name=name) else: - cat = self._data._from_backing_data(codes) return type(self)._simple_new(cat, name=name) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 5bafd2e8e8503..f00d3b369a94d 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -238,3 +238,19 @@ def test_categorical_missing_from_one_frame(self): index=[0, 1, 2, 0, 1, 2], ) tm.assert_frame_equal(result, expected) + + def test_concat_categorical_same_categories_different_order(self): + # 
https://github.com/pandas-dev/pandas/issues/24845 + + c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False) + c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False) + c3 = pd.CategoricalIndex( + ["a", "a", "b", "b"], categories=["a", "b"], ordered=False + ) + + df1 = DataFrame({"A": [1, 2]}, index=c1) + df2 = DataFrame({"A": [3, 4]}, index=c2) + + result = pd.concat((df1, df2)) + expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3) + tm.assert_frame_equal(result, expected)