diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index fe5e4a57c557a..1fb43de5f4c5a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1009,6 +1009,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`). +- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`). - A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`). .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e6c1c99b509..0adeb7997a888 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -17,9 +17,10 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, - is_numeric_dtype, needs_i8_conversion) + is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, + is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, + is_integer_dtype, is_list_like, is_number, is_numeric_dtype, + needs_i8_conversion) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -1589,17 +1590,16 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys(lk, rk, sort=True): + # Some pre-processing for non-ndarray lk / rk if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values - # if we exactly match in categories, allow us to factorize on codes - if (is_categorical_dtype(lk) and + elif (is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)): - klass = libhashtable.Int64Factorizer - if lk.categories.equals(rk.categories): + # if we exactly match in categories, allow us to factorize on codes rk = rk.codes else: # Same categories in different orders -> recode @@ -1607,7 +1607,14 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif is_integer_dtype(lk) and is_integer_dtype(rk): + + elif (is_extension_array_dtype(lk.dtype) and + is_extension_array_dtype(rk.dtype) and + lk.dtype == rk.dtype): + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() + + if is_integer_dtype(lk) and is_integer_dtype(rk): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 42e481d974295..ee22ffb3ccf97 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,6 +173,38 @@ def test_merge(self, data, na_value): dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + def test_merge_on_extension_array(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b], dtype=data.dtype) + + df = pd.DataFrame({"key": key, "val": [1, 2]}) + result = pd.merge(df, df, on='key') + expected = pd.DataFrame({"key": key, + "val_x": [1, 2], + "val_y": [1, 2]}) + self.assert_frame_equal(result, expected) + + # order + result = pd.merge(df.iloc[[1, 0]], df, on='key') + expected = expected.iloc[[1, 0]].reset_index(drop=True) + self.assert_frame_equal(result, expected) + + def test_merge_on_extension_array_duplicates(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b, a], dtype=data.dtype) + df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + + result = pd.merge(df1, df2, on='key') + expected = pd.DataFrame({ + "key": key.take([0, 0, 0, 0, 1]), + "val_x": [1, 1, 3, 3, 2], + "val_y": [1, 3, 1, 3, 2], + }) + self.assert_frame_equal(result, expected) + @pytest.mark.parametrize("columns", [ ["A", "B"], pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')], diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 94e180f9328d6..970802e94662a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column, CDT(categories, ordered=ordered)) assert_frame_equal(expected, result) + def test_merge_on_int_array(self): + # GH 23020 + df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B': 1}) + result = pd.merge(df, df, on='A') + expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), + 'B_x': 1, + 'B_y': 1}) + assert_frame_equal(result, expected) + @pytest.fixture def left_df():