BUG: fix bug where appending unordered CategoricalIndex variables overrides index (pandas-dev#24845)
GivyBoy · Aug 12, 2022 · 80c8c20 · 80c8c20
1 parent a62897a
commit 80c8c20
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 3 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -390,6 +390,47 @@ upon serialization. (Related issue :issue:`12997`)
     # Roundtripping now works
     pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
 
+.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical:
+
+DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` would incorrectly drop non-observed categories of non-grouping columns (:issue:`46357`).
+
+.. code-block:: ipython
+
+    In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2]
+    In [7]: df
+    Out[7]:
+       0
+    0  a
+    1  b
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [8]: df.groupby(level=0, observed=True).value_counts()
+    Out[8]:
+    0  a    1
+    1  b    1
+    dtype: int64
+
+
+*New Behavior*
+
+.. code-block:: ipython
+
+    In [9]: df.groupby(level=0, observed=True).value_counts()
+    Out[9]:
+    0  a    1
+    1  a    0
+       b    1
+    0  b    0
+       c    0
+    1  c    0
+    dtype: int64
+
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.api_breaking:
 
@@ -814,7 +855,7 @@ Categorical
 ^^^^^^^^^^^
 - Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`)
 - Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
--
+- Bug in :func:`concat` where concatenating two (or more) unordered ``CategoricalIndex`` objects whose categories are permutations of each other yielded incorrect index values (:issue:`24845`)
 
 Datetimelike
 ^^^^^^^^^^^^

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -572,13 +572,14 @@ def map(self, mapper):
     def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
         # if calling index is category, don't check dtype of others
         try:
-            codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
+            cat = Categorical._concat_same_type(
+                [self._is_dtype_compat(c) for c in to_concat]
+            )
         except TypeError:
             # not all to_concat elements are among our categories (or NA)
             from pandas.core.dtypes.concat import concat_compat
 
             res = concat_compat([x._values for x in to_concat])
             return Index(res, name=name)
         else:
-            cat = self._data._from_backing_data(codes)
             return type(self)._simple_new(cat, name=name)
diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py
@@ -238,3 +238,19 @@ def test_categorical_missing_from_one_frame(self):
             index=[0, 1, 2, 0, 1, 2],
         )
         tm.assert_frame_equal(result, expected)
+
+    def test_concat_categorical_same_categories_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/24845
+
+        c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
+        c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
+        c3 = pd.CategoricalIndex(
+            ["a", "a", "b", "b"], categories=["a", "b"], ordered=False
+        )
+
+        df1 = DataFrame({"A": [1, 2]}, index=c1)
+        df2 = DataFrame({"A": [3, 4]}, index=c2)
+
+        result = pd.concat((df1, df2))
+        expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
+        tm.assert_frame_equal(result, expected)