diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index c69de149a0f35..8d26c51ae6527 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -178,9 +178,18 @@ Reshaping
 -
 -
 
+ExtensionArray
+^^^^^^^^^^^^^^
+
+- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
+- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
+-
+-
+
 Other
 ^^^^^
 
 -
 -
 -
+-
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d8bdd9ac9ed22..0564cdbbb2014 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2204,7 +2204,7 @@ def _binop(self, other, func, level=None, fill_value=None):
         result.name = None
         return result
 
-    def combine(self, other, func, fill_value=np.nan):
+    def combine(self, other, func, fill_value=None):
         """
         Perform elementwise binary operation on two Series using given function
         with optional fill value when an index is missing from one Series or
@@ -2216,6 +2216,8 @@ def combine(self, other, func, fill_value=np.nan):
         func : function
             Function that takes two scalars as inputs and return a scalar
         fill_value : scalar value
+            The default specifies to use the appropriate NaN value for
+            the underlying dtype of the Series
 
         Returns
         -------
@@ -2235,20 +2237,38 @@ def combine(self, other, func, fill_value=np.nan):
         Series.combine_first : Combine Series values, choosing the calling
             Series's values first
         """
+        if fill_value is None:
+            fill_value = na_value_for_dtype(self.dtype, compat=False)
+
         if isinstance(other, Series):
+            # If other is a Series, result is based on union of Series,
+            # so do this element by element
             new_index = self.index.union(other.index)
             new_name = ops.get_op_result_name(self, other)
-            new_values = np.empty(len(new_index), dtype=self.dtype)
-            for i, idx in enumerate(new_index):
+            new_values = []
+            for idx in new_index:
                 lv = self.get(idx, fill_value)
                 rv = other.get(idx, fill_value)
                 with np.errstate(all='ignore'):
-                    new_values[i] = func(lv, rv)
+                    new_values.append(func(lv, rv))
         else:
+            # Assume that other is a scalar, so apply the function for
+            # each element in the Series
             new_index = self.index
             with np.errstate(all='ignore'):
-                new_values = func(self._values, other)
+                new_values = [func(lv, other) for lv in self._values]
             new_name = self.name
+
+        if is_categorical_dtype(self.values):
+            pass
+        elif is_extension_array_dtype(self.values):
+            # The function can return something of any type, so check
+            # if the type is compatible with the calling EA
+            try:
+                new_values = self._values._from_sequence(new_values)
+            except TypeError:
+                pass
+
         return self._constructor(new_values, index=new_index, name=new_name)
 
     def combine_first(self, other):
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index c5436aa731d50..23227867ee4d7 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -103,3 +103,37 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
 
         tm.assert_numpy_array_equal(l1, l2)
         self.assert_extension_array_equal(u1, u2)
+
+    def test_combine_le(self, data_repeated):
+        # GH 20825
+        # Test that combine works when doing a <= (le) comparison
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
+        expected = pd.Series([a <= b for (a, b) in
+                              zip(list(orig_data1), list(orig_data2))])
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 <= x2)
+        expected = pd.Series([a <= val for a in list(orig_data1)])
+        self.assert_series_equal(result, expected)
+
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            orig_data1._from_sequence([a + b for (a, b) in
+                                       zip(list(orig_data1),
+                                           list(orig_data2))]))
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            orig_data1._from_sequence([a + val for a in list(orig_data1)]))
+        self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
index 530a4e7a22a7a..61fdb8454b542 100644
--- a/pandas/tests/extension/category/test_categorical.py
+++ b/pandas/tests/extension/category/test_categorical.py
@@ -1,6 +1,7 @@
 import string
 
 import pytest
+import pandas as pd
 import numpy as np
 
 from pandas.api.types import CategoricalDtype
@@ -29,6 +30,15 @@ def data_missing():
     return Categorical([np.nan, 'A'])
 
 
+@pytest.fixture
+def data_repeated():
+    """Return different versions of data for count times"""
+    def gen(count):
+        for _ in range(count):
+            yield Categorical(make_data())
+    yield gen
+
+
 @pytest.fixture
 def data_for_sorting():
     return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'],
@@ -154,6 +164,22 @@ class TestMethods(base.BaseMethodsTests):
     def test_value_counts(self, all_data, dropna):
         pass
 
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        # When adding categoricals in combine, result is a string
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 + x2)
+        expected = pd.Series(([a + b for (a, b) in
+                               zip(list(orig_data1), list(orig_data2))]))
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series([a + val for a in list(orig_data1)])
+        self.assert_series_equal(result, expected)
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
index bbd31c4071b91..4bbbb7df2f399 100644
--- a/pandas/tests/extension/conftest.py
+++ b/pandas/tests/extension/conftest.py
@@ -30,6 +30,15 @@ def all_data(request, data, data_missing):
         return data_missing
 
 
+@pytest.fixture
+def data_repeated():
+    """Return different versions of data for count times"""
+    def gen(count):
+        for _ in range(count):
+            yield NotImplementedError
+    yield gen
+
+
 @pytest.fixture
 def data_for_sorting():
     """Length-3 array with a known sort order.
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 90f0181beab0d..cc6fadc483d5e 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -28,7 +28,9 @@ class DecimalArray(ExtensionArray):
     dtype = DecimalDtype()
 
     def __init__(self, values):
-        assert all(isinstance(v, decimal.Decimal) for v in values)
+        for val in values:
+            if not isinstance(val, self.dtype.type):
+                raise TypeError
         values = np.asarray(values, dtype=object)
 
         self._data = values
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 1f8cf0264f62f..f74b4d7e94f11 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -25,6 +25,14 @@ def data_missing():
     return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)])
 
 
+@pytest.fixture
+def data_repeated():
+    def gen(count):
+        for _ in range(count):
+            yield DecimalArray(make_data())
+    yield gen
+
+
 @pytest.fixture
 def data_for_sorting():
     return DecimalArray([decimal.Decimal('1'),
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index b7ac8033f3f6d..85a282ae4007f 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -187,6 +187,14 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending):
         super(TestMethods, self).test_sort_values_missing(
             data_missing_for_sorting, ascending)
 
+    @pytest.mark.skip(reason="combine for JSONArray not supported")
+    def test_combine_le(self, data_repeated):
+        pass
+
+    @pytest.mark.skip(reason="combine for JSONArray not supported")
+    def test_combine_add(self, data_repeated):
+        pass
+
 
 class TestCasting(BaseJSON, base.BaseCastingTests):
     @pytest.mark.xfail
diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py
index 6cf60e818c845..f35cce6ac9d71 100644
--- a/pandas/tests/series/test_combine_concat.py
+++ b/pandas/tests/series/test_combine_concat.py
@@ -60,6 +60,19 @@ def test_append_duplicates(self):
         with tm.assert_raises_regex(ValueError, msg):
             pd.concat([s1, s2], verify_integrity=True)
 
+    def test_combine_scalar(self):
+        # GH 21248
+        # Note - combine() with another Series is tested elsewhere because
+        # it is used when testing operators
+        s = pd.Series([i * 10 for i in range(5)])
+        result = s.combine(3, lambda x, y: x + y)
+        expected = pd.Series([i * 10 + 3 for i in range(5)])
+        tm.assert_series_equal(result, expected)
+
+        result = s.combine(22, lambda x, y: min(x, y))
+        expected = pd.Series([min(i * 10, 22) for i in range(5)])
+        tm.assert_series_equal(result, expected)
+
    def test_combine_first(self):
         values = tm.makeIntIndex(20).values.astype(float)
         series = Series(values, index=tm.makeIntIndex(20))
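
For context, a minimal usage sketch of the behaviour the patched `Series.combine` is intended to have, mirroring `test_combine_scalar` and the union-alignment path above. The sample values, index labels, and printed outputs are illustrative only (not taken from the test suite) and assume a pandas build with this change applied.

```python
import pandas as pd

# Scalar `other`: the function is applied element by element (GH 21248),
# so a plain Python callable such as min() works.
s = pd.Series([0, 10, 20, 30, 40])
s.combine(22, lambda x, y: min(x, y))
# 0     0
# 1    10
# 2    20
# 3    22
# 4    22
# dtype: int64

# Series `other`: values are aligned on the union of the two indexes, and
# fill_value (defaulting to the NA value for the Series dtype) stands in
# for labels missing from one side.
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([10, 20, 30], index=['b', 'c', 'd'])
s1.combine(s2, lambda x, y: x + y, fill_value=0)
# a     1
# b    12
# c    23
# d    30
# dtype: int64

# ExtensionArray-backed Series: the combined values are passed back through
# _from_sequence(); categoricals are skipped and a TypeError falls back to
# ordinary dtype inference, so a comparison yields a plain boolean Series.
c1 = pd.Series(pd.Categorical(['a', 'b', 'b']))
c2 = pd.Series(pd.Categorical(['b', 'a', 'b']))
c1.combine(c2, lambda x, y: x == y)
# 0    False
# 1    False
# 2     True
# dtype: bool
```

Previously the scalar path evaluated `func(self._values, other)` on the whole underlying array at once, so non-vectorised callables like `min` could raise; iterating element by element is what makes arbitrary functions work.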