Skip to content

Commit

Permalink
BUG: merging an Integer EA raises (#23262)
Browse files Browse the repository at this point in the history
  • Loading branch information
makbigc authored and jreback committed Dec 19, 2018
1 parent c230f29 commit 14c33b0
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
- Bug in :meth:`DataFrame.groupby` where aggregating on an ``ExtensionArray`` did not return the actual ``ExtensionArray`` dtype (:issue:`23227`).
- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`).
- A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`).

.. _whatsnew_0240.api.incompatibilities:
Expand Down
23 changes: 15 additions & 8 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype,
is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number,
is_numeric_dtype, needs_i8_conversion)
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
needs_i8_conversion)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype

from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
Expand Down Expand Up @@ -1589,25 +1590,31 @@ def _right_outer_join(x, y, max_groups):


def _factorize_keys(lk, rk, sort=True):
# Some pre-processing for non-ndarray lk / rk
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
lk = lk.values
rk = rk.values

# if we exactly match in categories, allow us to factorize on codes
if (is_categorical_dtype(lk) and
elif (is_categorical_dtype(lk) and
is_categorical_dtype(rk) and
lk.is_dtype_equal(rk)):
klass = libhashtable.Int64Factorizer

if lk.categories.equals(rk.categories):
# if we exactly match in categories, allow us to factorize on codes
rk = rk.codes
else:
# Same categories in different orders -> recode
rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)

lk = ensure_int64(lk.codes)
rk = ensure_int64(rk)
elif is_integer_dtype(lk) and is_integer_dtype(rk):

elif (is_extension_array_dtype(lk.dtype) and
is_extension_array_dtype(rk.dtype) and
lk.dtype == rk.dtype):
lk, _ = lk._values_for_factorize()
rk, _ = rk._values_for_factorize()

if is_integer_dtype(lk) and is_integer_dtype(rk):
# GH#23917 TODO: needs tests for case where lk is integer-dtype
# and rk is datetime-dtype
klass = libhashtable.Int64Factorizer
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,38 @@ def test_merge(self, data, na_value):
dtype=data.dtype)})
self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])

def test_merge_on_extension_array(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b], dtype=data.dtype)

df = pd.DataFrame({"key": key, "val": [1, 2]})
result = pd.merge(df, df, on='key')
expected = pd.DataFrame({"key": key,
"val_x": [1, 2],
"val_y": [1, 2]})
self.assert_frame_equal(result, expected)

# order
result = pd.merge(df.iloc[[1, 0]], df, on='key')
expected = expected.iloc[[1, 0]].reset_index(drop=True)
self.assert_frame_equal(result, expected)

def test_merge_on_extension_array_duplicates(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})

result = pd.merge(df1, df2, on='key')
expected = pd.DataFrame({
"key": key.take([0, 0, 0, 0, 1]),
"val_x": [1, 1, 3, 3, 2],
"val_y": [1, 3, 1, 3, 2],
})
self.assert_frame_equal(result, expected)

@pytest.mark.parametrize("columns", [
["A", "B"],
pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column,
CDT(categories, ordered=ordered))
assert_frame_equal(expected, result)

def test_merge_on_int_array(self):
# GH 23020
df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
'B': 1})
result = pd.merge(df, df, on='A')
expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
'B_x': 1,
'B_y': 1})
assert_frame_equal(result, expected)


@pytest.fixture
def left_df():
Expand Down

0 comments on commit 14c33b0

Please sign in to comment.