Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

BUG: DataFrame.append with timedelta64 #39574

Merged
16 commits merged into the base branch
Feb 12, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
return arr.astype(dtype, copy=False)


def concat_compat(to_concat, axis: int = 0):
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
"""
provide concatenation of an array of arrays each of which is a single
'normalized' dtypes (in that for example, if it's object, then it is a
Expand All @@ -72,6 +72,9 @@ def concat_compat(to_concat, axis: int = 0):
----------
to_concat : array of arrays
axis : axis to provide concatenation
ea_compat_axis : bool, default False
For ExtensionArray compat, behave as if axis == 1 when determining
whether to drop empty arrays.

Returns
-------
Expand All @@ -91,7 +94,8 @@ def is_nonempty(x) -> bool:
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.
non_empties = [x for x in to_concat if is_nonempty(x)]
if non_empties and axis == 0:
if non_empties and axis == 0 and not ea_compat_axis:
# ea_compat_axis see GH#39574
to_concat = non_empties

kinds = {obj.dtype.kind for obj in to_concat}
Expand Down
155 changes: 43 additions & 112 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from __future__ import annotations

from collections import defaultdict
import copy
import itertools
from typing import TYPE_CHECKING, Dict, List, Sequence, cast
from typing import TYPE_CHECKING, Dict, List, Sequence

import numpy as np

Expand All @@ -15,26 +14,21 @@
from pandas.core.dtypes.common import (
get_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_extension_array_dtype,
is_float_dtype,
is_numeric_dtype,
is_sparse,
is_timedelta64_dtype,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.missing import isna_all
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna_all

import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray, ExtensionArray
from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.blocks import make_block
from pandas.core.internals.managers import BlockManager

if TYPE_CHECKING:
from pandas import Index
from pandas.core.arrays.sparse.dtype import SparseDtype


def concatenate_block_managers(
Expand Down Expand Up @@ -233,6 +227,24 @@ def dtype(self):
else:
return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0])

def is_valid_na_for(self, dtype: DtypeObj) -> bool:
"""
Check that we are all-NA of a type/dtype that is compatible with this dtype.
"""
if not self.is_na:
return False
if self.block is None:
return True

if self.dtype == object:
values = self.block.values
return all(
is_valid_nat_for_dtype(x, dtype) for x in values.ravel(order="K")
)
Copy link
Member

@jorisvandenbossche jorisvandenbossche Feb 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not only required for object dtype, I think. Also float NaN is considered "all NaN" when it comes to ignoring the dtype in concatting dataframes (and other dtypes as well I think):

In [39]: pd.concat([pd.DataFrame({'a': [np.nan, np.nan]}), pd.DataFrame({'a': [pd.Timestamp("2012-01-01")]})])
Out[39]: 
           a
0        NaT
1        NaT
0 2012-01-01

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the non-object case is handled below on L245-246. or do you have something else in mind?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does my snippet above work with this PR?
(if so, then I don't fully understand why the changes to test_append_empty_frame_to_series_with_dateutil_tz are needed)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does my snippet above work with this PR?

yes it does

(if so, then I don't fully understand why the changes to test_append_empty_frame_to_series_with_dateutil_tz are needed)

I think that's driven by something sketchy-looking in get_reindexed_values, will see if that can be addressed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better?


na_value = self.block.fill_value
return is_valid_nat_for_dtype(na_value, dtype)

@cache_readonly
def is_na(self) -> bool:
if self.block is None:
Expand Down Expand Up @@ -263,7 +275,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
else:
fill_value = upcasted_na

if self.is_na:
if self.is_valid_na_for(empty_dtype):
blk_dtype = getattr(self.block, "dtype", None)

if blk_dtype == np.dtype(object):
Expand Down Expand Up @@ -296,6 +308,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
empty_arr, allow_fill=True, fill_value=fill_value
)
else:
# NB: we should never get here with empty_dtype integer or bool;
# if we did, the missing_arr.fill would cast to gibberish
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
return missing_arr
Expand Down Expand Up @@ -363,9 +377,11 @@ def _concatenate_join_units(
# concatting with at least one EA means we are concatting a single column
# the non-EA values are 2D arrays with shape (1, n)
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
concat_values = concat_compat(to_concat, axis=0)
if not isinstance(concat_values, ExtensionArray) or (
isinstance(concat_values, DatetimeArray) and concat_values.tz is None
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
if (
not isinstance(concat_values, ExtensionArray)
or (isinstance(concat_values, DatetimeArray) and concat_values.tz is None)
or isinstance(concat_values, TimedeltaArray)
):
# if the result of concat is not an EA but an ndarray, reshape to
# 2D to put it a non-EA Block
Expand Down Expand Up @@ -420,108 +436,23 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
return empty_dtype

has_none_blocks = any(unit.block is None for unit in join_units)
dtypes = [None if unit.block is None else unit.dtype for unit in join_units]

filtered_dtypes = [
dtypes = [
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
]
if not len(filtered_dtypes):
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
dtype_alt = find_common_type(filtered_dtypes)

upcast_classes = _get_upcast_classes(join_units, dtypes)

if is_extension_array_dtype(dtype_alt):
return dtype_alt
elif dtype_alt == object:
return dtype_alt

# TODO: de-duplicate with maybe_promote?
# create the result
if "extension" in upcast_classes:
return np.dtype("object")
elif "bool" in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_)
else:
return np.dtype(np.bool_)
elif "datetimetz" in upcast_classes:
# GH-25014. We use NaT instead of iNaT, since this eventually
# ends up in DatetimeArray.take, which does not allow iNaT.
dtype = upcast_classes["datetimetz"]
return dtype[0]
elif "datetime" in upcast_classes:
return np.dtype("M8[ns]")
elif "timedelta" in upcast_classes:
return np.dtype("m8[ns]")
else:
try:
common_dtype = np.find_common_type(upcast_classes, [])
except TypeError:
# At least one is an ExtensionArray
return np.dtype(np.object_)
else:
if is_float_dtype(common_dtype):
return common_dtype
elif is_numeric_dtype(common_dtype):
if has_none_blocks:
return np.dtype(np.float64)
else:
return common_dtype

msg = "invalid dtype determination in get_concat_dtype"
raise AssertionError(msg)


def _get_upcast_classes(
join_units: Sequence[JoinUnit],
dtypes: Sequence[DtypeObj],
) -> Dict[str, List[DtypeObj]]:
"""Create mapping between upcast class names and lists of dtypes."""
upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue

upcast_cls = _select_upcast_cls_from_dtype(dtype)
# Null blocks should not influence upcast class selection, unless there
# are only null blocks, when same upcasting rules must be applied to
# null upcast classes.
if unit.is_na:
null_upcast_classes[upcast_cls].append(dtype)
else:
upcast_classes[upcast_cls].append(dtype)

if not upcast_classes:
upcast_classes = null_upcast_classes

return upcast_classes


def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
"""Select upcast class name based on dtype."""
if is_categorical_dtype(dtype):
return "extension"
elif is_datetime64tz_dtype(dtype):
return "datetimetz"
elif is_extension_array_dtype(dtype):
return "extension"
elif issubclass(dtype.type, np.bool_):
return "bool"
elif issubclass(dtype.type, np.object_):
return "object"
elif is_datetime64_dtype(dtype):
return "datetime"
elif is_timedelta64_dtype(dtype):
return "timedelta"
elif is_sparse(dtype):
dtype = cast("SparseDtype", dtype)
return dtype.subtype.name
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
return dtype.name
else:
return "float"
if not len(dtypes):
dtypes = [unit.dtype for unit in join_units if unit.block is not None]

dtype = find_common_type(dtypes)
if has_none_blocks:
if not isinstance(dtype, np.dtype):
# EA dtype
pass
elif dtype.kind in ["i", "u"]:
dtype = np.dtype(np.float64)
elif dtype.kind == "b":
dtype = np.dtype(object)
return dtype


def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
Expand Down
34 changes: 27 additions & 7 deletions pandas/tests/reshape/concat/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,29 +334,34 @@ def test_append_missing_column_proper_upcast(self, sort):
def test_append_empty_frame_to_series_with_dateutil_tz(self):
# GH 23682
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
s = Series({"date": date, "a": 1.0, "b": 2.0})
ser = Series({"date": date, "a": 1.0, "b": 2.0})
df = DataFrame(columns=["c", "d"])
result_a = df.append(s, ignore_index=True)
result_a = df.append(ser, ignore_index=True)
expected = DataFrame(
[[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
)
# These columns get cast to object after append
expected["c"] = expected["c"].astype(object)
expected["d"] = expected["d"].astype(object)
expected["date"] = expected["date"].astype(object)
# TODO: "date" might make sense to keep as dt64tz
tm.assert_frame_equal(result_a, expected)

expected = DataFrame(
[[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
)
expected["c"] = expected["c"].astype(object)
expected["d"] = expected["d"].astype(object)

result_b = result_a.append(s, ignore_index=True)
expected["date"] = expected["date"].astype(object)
# TODO: "date" might make sense to keep as dt64tz
result_b = result_a.append(ser, ignore_index=True)
tm.assert_frame_equal(result_b, expected)

# column order is different
expected = expected[["c", "d", "date", "a", "b"]]
result = df.append([s, s], ignore_index=True)
dtype = Series([date]).dtype
expected["date"] = expected["date"].astype(dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this still needed? (might be a left-over from astyping it to object before)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're right, updated

result = df.append([ser, ser], ignore_index=True)
tm.assert_frame_equal(result, expected)

def test_append_empty_tz_frame_with_datetime64ns(self):
Expand All @@ -378,12 +383,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
@pytest.mark.parametrize(
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
@pytest.mark.parametrize("val", [1, "NaT"])
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame(columns=["a"]).astype(dtype_str)

other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
other = DataFrame({"a": [np.timedelta64(val, "ns")]})
result = df.append(other, ignore_index=True)

expected = other.astype(object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
@pytest.mark.parametrize("val", [1, "NaT"])
def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame({"a": pd.array([1], dtype=dtype_str)})

other = DataFrame({"a": [np.timedelta64(val, "ns")]})
result = df.append(other, ignore_index=True)

expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
tm.assert_frame_equal(result, expected)
36 changes: 31 additions & 5 deletions pandas/tests/reshape/merge/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,10 +528,8 @@ def test_merge_datetime_multi_index_empty_df(self, merge_type):
tm.assert_frame_equal(results_merge, expected)
tm.assert_frame_equal(results_join, expected)

def test_join_multi_levels(self):

# GH 3662
# merge multi-levels
@pytest.fixture
def household(self):
household = DataFrame(
{
"household_id": [1, 2, 3],
Expand All @@ -540,6 +538,10 @@ def test_join_multi_levels(self):
},
columns=["household_id", "male", "wealth"],
).set_index("household_id")
return household

@pytest.fixture
def portfolio(self):
portfolio = DataFrame(
{
"household_id": [1, 2, 2, 3, 3, 3, 4],
Expand All @@ -565,7 +567,10 @@ def test_join_multi_levels(self):
},
columns=["household_id", "asset_id", "name", "share"],
).set_index(["household_id", "asset_id"])
result = household.join(portfolio, how="inner")
return portfolio

@pytest.fixture
def expected(self):
expected = (
DataFrame(
{
Expand Down Expand Up @@ -601,8 +606,21 @@ def test_join_multi_levels(self):
.set_index(["household_id", "asset_id"])
.reindex(columns=["male", "wealth", "name", "share"])
)
return expected

def test_join_multi_levels(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()

# GH 3662
# merge multi-levels
result = household.join(portfolio, how="inner")
tm.assert_frame_equal(result, expected)

def test_join_multi_levels_merge_equivalence(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()

# equivalency
result = merge(
household.reset_index(),
Expand All @@ -612,6 +630,10 @@ def test_join_multi_levels(self):
).set_index(["household_id", "asset_id"])
tm.assert_frame_equal(result, expected)

def test_join_multi_levels_outer(self, portfolio, household, expected):
portfolio = portfolio.copy()
household = household.copy()

result = household.join(portfolio, how="outer")
expected = concat(
[
Expand All @@ -630,6 +652,10 @@ def test_join_multi_levels(self):
).reindex(columns=expected.columns)
tm.assert_frame_equal(result, expected)

def test_join_multi_levels_invalid(self, portfolio, household):
portfolio = portfolio.copy()
household = household.copy()

# invalid cases
household.index.name = "foo"

Expand Down