rapidsai · rapids-bot · Jun 15, 2021 · May 26, 2021 · May 26, 2021 · May 26, 2021
@@ -1506,27 +1506,23 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
             offset=codes_col.offset,
         )
 
-    def _copy_type_metadata(
-        self: CategoricalColumn, other: ColumnBase
-    ) -> ColumnBase:
-        """Copies type metadata from self onto other, returning a new column.
-
-        In addition to the default behavior, if `other` is not a
-        CategoricalColumn, we assume other is a column of codes, and return a
-        CategoricalColumn composed of `other`  and the categories of `self`.
-        """
-        if not isinstance(other, cudf.core.column.CategoricalColumn):
-            other = column.build_categorical_column(
-                categories=self.categories,
-                codes=column.as_column(other.base_data, dtype=other.dtype),
-                mask=other.base_mask,
-                ordered=self.ordered,
-                size=other.size,
-                offset=other.offset,
-                null_count=other.null_count,
+    def _apply_type_metadata(
+        self: CategoricalColumn, dtype: Dtype
+    ) -> CategoricalColumn:
+        """Return this column with the metadata of ``dtype`` applied.
+
+        When ``dtype`` is a ``CategoricalDtype``, a categorical column is
+        rebuilt from this column's codes, taking the categories and the
+        ``ordered`` flag from ``dtype``. For any other ``dtype`` the column
+        is returned unchanged.
+        """
+        if isinstance(dtype, CategoricalDtype):
+            # Codes data, mask, size, offset and null count all come from
+            # ``self.codes``; only categories and ordering come from ``dtype``.
+            self = column.build_categorical_column(
+                categories=dtype.categories._values,
+                codes=column.as_column(
+                    self.codes.base_data, dtype=self.codes.dtype
+                ),
+                mask=self.codes.base_mask,
+                ordered=dtype.ordered,
+                size=self.codes.size,
+                offset=self.codes.offset,
+                null_count=self.codes.null_count,
             )
-        # Have to ignore typing here because it misdiagnoses super().
-        return super()._copy_type_metadata(other)  # type: ignore
+
+        return self
 
 
 def _create_empty_categorical_column(

@@ -41,6 +41,7 @@
 from cudf.core.buffer import Buffer
 from cudf.core.dtypes import (
     CategoricalDtype,
+    Decimal64Dtype,
     IntervalDtype,
     ListDtype,
     StructDtype,
@@ -1267,6 +1268,9 @@ def scatter_to_table(
             }
         )
 
+    def _apply_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
+        """Return this column with the metadata of ``dtype`` applied.
+
+        This base implementation ignores ``dtype`` and returns ``self``
+        unchanged.
+        """
+        return self
+
     def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
         """
         Copies type metadata from self onto other, returning a new column.
@@ -1276,17 +1280,7 @@ def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
           and the children of `other`.
         * if none of the above, return `other` without any changes
         """
-        # TODO: This logic should probably be moved to a common nested column
-        # class.
-        if isinstance(other, type(self)):
-            if self.base_children and other.base_children:
-                base_children = tuple(
-                    self.base_children[i]._copy_type_metadata(
-                        other.base_children[i]
-                    )
-                    for i in range(len(self.base_children))
-                )
-                other.set_base_children(base_children)
+        other = other._apply_type_metadata(self.dtype)
 
         return other
 
@@ -2200,6 +2194,17 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
     return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size)
 
 
+def _cudf_dtype_from_arrow_type(arrow_type: Dtype) -> Dtype:
+    """Map a pyarrow type onto the cudf dtype that carries its metadata.
+
+    Decimal, struct and list arrow types are converted with the
+    ``from_arrow`` constructor of ``Decimal64Dtype``, ``StructDtype`` and
+    ``ListDtype`` respectively. Any other type is handed back as given.
+    """
+    # Checked in order; the first predicate that matches picks the dtype.
+    converters = (
+        (pa.types.is_decimal, Decimal64Dtype),
+        (pa.types.is_struct, StructDtype),
+        (pa.types.is_list, ListDtype),
+    )
+    for matches, cudf_dtype in converters:
+        if matches(arrow_type):
+            return cudf_dtype.from_arrow(arrow_type)
+
+    return arrow_type
+
+
 def _copy_type_metadata_from_arrow(
     arrow_array: pa.array, cudf_column: ColumnBase
 ) -> ColumnBase:
@@ -2211,45 +2216,9 @@ def _copy_type_metadata_from_arrow(
     * When `arrow_array` is decimal type and `cudf_column` is
     Decimal64Dtype, copy precisions.
     """
-    if pa.types.is_decimal(arrow_array.type) and isinstance(
-        cudf_column, cudf.core.column.DecimalColumn
-    ):
-        cudf_column.dtype.precision = arrow_array.type.precision
-    elif pa.types.is_struct(arrow_array.type) and isinstance(
-        cudf_column, cudf.core.column.StructColumn
-    ):
-        base_children = tuple(
-            _copy_type_metadata_from_arrow(arrow_array.field(i), col_child)
-            for i, col_child in enumerate(cudf_column.base_children)
-        )
-        cudf_column.set_base_children(base_children)
-        return cudf.core.column.StructColumn(
-            data=None,
-            size=cudf_column.base_size,
-            dtype=StructDtype.from_arrow(arrow_array.type),
-            mask=cudf_column.base_mask,
-            offset=cudf_column.offset,
-            null_count=cudf_column.null_count,
-            children=base_children,
-        )
-    elif pa.types.is_list(arrow_array.type) and isinstance(
-        cudf_column, cudf.core.column.ListColumn
-    ):
-        if arrow_array.values and cudf_column.base_children:
-            base_children = (
-                cudf_column.base_children[0],
-                _copy_type_metadata_from_arrow(
-                    arrow_array.values, cudf_column.base_children[1]
-                ),
-            )
-            return cudf.core.column.ListColumn(
-                size=cudf_column.base_size,
-                dtype=ListDtype.from_arrow(arrow_array.type),
-                mask=cudf_column.base_mask,
-                offset=cudf_column.offset,
-                null_count=cudf_column.null_count,
-                children=base_children,
-            )
+    cudf_column = cudf_column._apply_type_metadata(
+        _cudf_dtype_from_arrow_type(arrow_array.type)
+    )
 
     return cudf_column
 

@@ -209,16 +209,13 @@ def __cuda_array_interface__(self):
             "Decimals are not yet supported via `__cuda_array_interface__`"
         )
 
-    def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
-        """Copies type metadata from self onto other, returning a new column.
+    def _apply_type_metadata(
+        self: "cudf.core.column.DecimalColumn", dtype: Dtype
+    ) -> "cudf.core.column.DecimalColumn":
+        """Apply the metadata of ``dtype`` to this column and return it.
+
+        If ``dtype`` is a ``Decimal64Dtype``, its precision is assigned onto
+        this column's own dtype; no new column is built, and ``self`` is
+        returned. Any other ``dtype`` leaves the column untouched.
+        """
+        if isinstance(dtype, Decimal64Dtype):
+            self.dtype.precision = dtype.precision
 
-        In addition to the default behavior, if `other` is also a decimal
-        column the precision is copied over.
-        """
-        if isinstance(other, DecimalColumn):
-            other.dtype.precision = self.dtype.precision  # type: ignore
-        # Have to ignore typing here because it misdiagnoses super().
-        return super()._copy_type_metadata(other)  # type: ignore
+        return self
 
 
 def _binop_scale(l_dtype, r_dtype, op):

@@ -16,7 +16,7 @@
     sort_lists,
 )
 from cudf._lib.table import Table
-from cudf._typing import BinaryOperand
+from cudf._typing import BinaryOperand, Dtype
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
@@ -233,6 +233,26 @@ def __cuda_array_interface__(self):
             "Lists are not yet supported via `__cuda_array_interface__`"
         )
 
+    def _apply_type_metadata(
+        self: "cudf.core.column.ListColumn", dtype: Dtype
+    ) -> "cudf.core.column.ListColumn":
+        """Return this column with the metadata of ``dtype`` applied.
+
+        For a ``ListDtype``, a new ``ListColumn`` is built whose first child
+        is reused as-is and whose second child has ``dtype.element_type``
+        applied to it recursively. Any other ``dtype`` returns ``self``.
+        """
+        if not isinstance(dtype, ListDtype):
+            return self
+
+        # NOTE(review): the rebuilt column is given ``self.dtype`` rather than
+        # the incoming ``dtype`` -- confirm the parent dtype still agrees with
+        # the second child once ``dtype.element_type`` has been applied to it.
+        return ListColumn(
+            size=self.base_size,
+            dtype=self.dtype,
+            mask=self.base_mask,
+            offset=self.offset,
+            null_count=self.null_count,
+            children=(
+                self.base_children[0],
+                self.base_children[1]._apply_type_metadata(dtype.element_type),
+            ),
+        )
+
 
 class ListMethods(ColumnMethodsMixin):
     """

@@ -21,7 +21,7 @@
     column,
     string,
 )
-from cudf.core.dtypes import Decimal64Dtype
+from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype
 from cudf.utils import cudautils, utils
 from cudf.utils.dtypes import (
     NUMERIC_TYPES,
@@ -544,6 +544,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
         return False
 
+    def _apply_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
+        """Return this column with the metadata of ``dtype`` applied.
+
+        When ``dtype`` is a ``CategoricalDtype``, this column's data is used
+        as the codes of a new categorical column whose categories and
+        ``ordered`` flag come from ``dtype``. Any other ``dtype`` returns
+        ``self`` unchanged.
+        """
+        if not isinstance(dtype, CategoricalDtype):
+            return self
+
+        return column.build_categorical_column(
+            categories=dtype.categories._values,
+            codes=as_column(self.base_data, dtype=self.dtype),
+            mask=self.base_mask,
+            ordered=dtype.ordered,
+            size=self.size,
+            offset=self.offset,
+            null_count=self.null_count,
+        )
+
     def to_pandas(
         self, index: pd.Index = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":

@@ -4,8 +4,10 @@
 import pyarrow as pa
 
 import cudf
+from cudf._typing import Dtype
 from cudf.core.column import ColumnBase
 from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.dtypes import StructDtype
 from cudf.utils.dtypes import is_struct_dtype
 
 
@@ -111,18 +113,22 @@ def __cuda_array_interface__(self):
             "Structs are not yet supported via `__cuda_array_interface__`"
         )
 
-    def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
-        """Copies type metadata from self onto other, returning a new column.
-
-        In addition to the default behavior, if `other` is a StructColumns we
-        rename the fields of `other` to the field names of `self`.
-        """
-        if isinstance(other, cudf.core.column.StructColumn):
-            other = other._rename_fields(
-                self.dtype.fields.keys()  # type: ignore
-            )
-        # Have to ignore typing here because it misdiagnoses super().
-        return super()._copy_type_metadata(other)  # type: ignore
+    def _apply_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn:
+        """Return this column with the metadata of ``dtype`` applied.
+
+        For a ``StructDtype``, a new ``StructColumn`` is built in which the
+        i-th child has the dtype of the i-th field of ``dtype`` applied to it
+        recursively, and the result is renamed to the field names of
+        ``dtype``. Any other ``dtype`` returns ``self`` unchanged.
+        """
+        if not isinstance(dtype, StructDtype):
+            return self
+
+        # Read the field mapping once and reuse it for dtypes and names.
+        fields = dtype.fields
+        rebuilt = StructColumn(
+            data=None,
+            size=self.base_size,
+            dtype=self.dtype,
+            mask=self.base_mask,
+            offset=self.offset,
+            null_count=self.null_count,
+            children=tuple(
+                self.base_children[i]._apply_type_metadata(fields[name])
+                for i, name in enumerate(fields.keys())
+            ),
+        )
+        return rebuilt._rename_fields(fields.keys())
 
 
 class StructMethods(ColumnMethodsMixin):

@@ -367,6 +367,32 @@ def test_as_column_buffer(data, expected):
     assert_eq(cudf.Series(actual_column), cudf.Series(expected))
 
 
+@pytest.mark.parametrize(
+    "data,expected",
+    [
+        # Decimal: arrow decimal128 with precision 3 vs. Decimal64Dtype(3, 0).
+        (
+            pa.array([100, 200, 300], type=pa.decimal128(3)),
+            cudf.core.column.as_column(
+                [100, 200, 300], dtype=cudf.core.dtypes.Decimal64Dtype(3, 0)
+            ),
+        ),
+        # Struct: rows are dicts with differing keys.
+        (
+            pa.array([{"a": 1, "b": 3}, {"c": 2, "d": 4}]),
+            cudf.core.column.as_column([{"a": 1, "b": 3}, {"c": 2, "d": 4}]),
+        ),
+        # List: two levels of nesting.
+        (
+            pa.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]),
+            cudf.core.column.as_column(
+                [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
+            ),
+        ),
+    ],
+)
+def test_as_column_arrow_array(data, expected):
+    """``as_column`` on a pyarrow array should match the expected column."""
+    got = cudf.Series(cudf.core.column.as_column(data))
+    want = cudf.Series(expected)
+    assert_eq(got, want)
+
+
 @pytest.mark.parametrize(
     "pd_dtype,expect_dtype",
     [