rapidsai · kkraus14 · Jan 12, 2020 · Nov 13, 2019 · Nov 13, 2019 · Nov 13, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -256,6 +256,7 @@
 - PR #3500 cudf::fill()/cudf::repeat() support for strings columns.
 - PR #3438 Update scalar and scalar_device_view to better support strings
 - PR #3414 Add copy_range function for strings column
+- PR #3451 Add support for implicit typecasting of join columns
 
 ## Bug Fixes
 

@@ -412,6 +412,74 @@ def is_monotonic_decreasing(self):
                 )
         return self._is_monotonic_decreasing
 
+    def can_cast_safely(self, to_dtype):
+        """
+        Returns true if all the values in self can be
+        safely cast to dtype
+
+        Parameters
+        ----------
+        to_dtype : numpy.dtype
+            Target dtype the column would be cast to.
+
+        Returns
+        -------
+        bool
+            True when the cast is judged safe (a range check for
+            same-kind casts, an exactness check between int and float);
+            False otherwise, including for unhandled kind combinations.
+        """
+        from_kind, to_kind = self.dtype.kind, to_dtype.kind
+
+        if from_kind == to_kind:
+            if self.dtype <= to_dtype:
+                return True
+            # Kinds are the same but to_dtype is smaller
+            if "float" in to_dtype.name:
+                info = np.finfo(to_dtype)
+            elif "int" in to_dtype.name:
+                info = np.iinfo(to_dtype)
+            else:
+                # no range information for this dtype; be conservative
+                return False
+            # info.min and info.max are themselves representable, so the
+            # bounds are inclusive
+            return bool(
+                (self.min() >= info.min) and (self.max() <= info.max)
+            )
+
+        # want to cast int to float
+        if to_kind == "f" and from_kind == "i":
+            info = np.finfo(to_dtype)
+            # integers up to 2 ** (nmant + 1) in magnitude are exact
+            biggest_exact_int = 2 ** (info.nmant + 1)
+            if (self.min() >= -biggest_exact_int) and (
+                self.max() <= biggest_exact_int
+            ):
+                return True
+
+            from cudf import Series
+
+            # larger values may still be exact: compare after a round trip
+            col = Series(self)
+            round_trip = col.astype(to_dtype).astype(self.dtype)
+            return bool((round_trip == col).all())
+
+        # want to cast float to int:
+        if to_kind == "i" and from_kind == "f":
+            info = np.iinfo(to_dtype)
+            # best we can do is hope to catch it here and avoid compare
+            if not ((self.min() >= info.min) and (self.max() <= info.max)):
+                return False
+
+            from cudf import Series
+
+            # in range; safe only if no value has a fractional part
+            return bool((Series(self) % 1 == 0).all())
+
+        # unhandled kind combination: report the cast as unsafe
+        return False
+
 
 def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
     if reflect:

@@ -2203,6 +2203,125 @@ def melt(self, **kwargs):
 
         return melt(self, **kwargs)
 
+    def _typecast_before_merge(self, lhs, rhs, left_on, right_on, how):
+        """
+        Implicitly typecast mismatched join key columns before a merge
+
+        Parameters
+        ----------
+        lhs, rhs : DataFrame
+            Left and right operands; their key columns are reassigned
+            when a common dtype is found.
+        left_on, right_on : list of column names
+            Join keys; ``left_on[i]`` is matched with ``right_on[i]``.
+        how : str
+            Join type ("left", "right", "inner" or "outer").
+
+        Returns
+        -------
+        tuple
+            ``(lhs, rhs, to_categorical)``; ``to_categorical`` names the
+            key columns for which a ``<name>_codes`` column was added.
+
+        Raises
+        ------
+        TypeError
+            If both key columns are categorical with different dtypes.
+        """
+
+        def casting_rules(dtype_l, dtype_r, how):
+            # Pick the common dtype for the key pair the loop below is
+            # visiting (lcol / rcol come from the enclosing scope).
+            # Returns None when no rule applies.
+            cast_warn = (
+                "can't safely cast column {} from {} with type "
+                "{} to {}, upcasting to {}"
+            )
+            ctgry_err = (
+                "can't implicitly cast column {0} to categories "
+                "from {1} during {1} join"
+            )
+
+            rtn = None
+            if pd.api.types.is_dtype_equal(dtype_l, dtype_r):
+                rtn = dtype_l
+            elif is_categorical_dtype(dtype_l) and is_categorical_dtype(
+                dtype_r
+            ):
+                raise TypeError("Left and right categories must be the same.")
+            elif how == "left":
+                # keep the left dtype if the right values fit into it,
+                # otherwise fall back to the inner-join rules and warn
+                check_col = rhs._data[rcol].fillna(0)
+                if not check_col.can_cast_safely(dtype_l):
+                    rtn = casting_rules(dtype_l, dtype_r, "inner")
+                    warnings.warn(
+                        cast_warn.format(rcol, "right", dtype_r, dtype_l, rtn)
+                    )
+                else:
+                    rtn = dtype_l
+            elif how == "right":
+                # mirror image of the left join case
+                check_col = lhs._data[lcol].fillna(0)
+                if not check_col.can_cast_safely(dtype_r):
+                    rtn = casting_rules(dtype_l, dtype_r, "inner")
+                    warnings.warn(
+                        cast_warn.format(lcol, "left", dtype_l, dtype_r, rtn)
+                    )
+                else:
+                    rtn = dtype_r
+            elif is_categorical_dtype(dtype_l):
+                # NOTE(review): "right" joins are consumed by the branch
+                # above, so this check cannot trigger as written
+                if how == "right":
+                    raise ValueError(ctgry_err.format(rcol, "right"))
+
+                # join on the category values and keep the codes in a
+                # side column
+                rtn = lhs[lcol].cat.categories.dtype
+                to_categorical.append(lcol)
+                lhs[lcol + "_codes"] = lhs[lcol].cat.codes
+            elif is_categorical_dtype(dtype_r):
+                # NOTE(review): likewise unreachable for "left" joins
+                if how == "left":
+                    raise ValueError(ctgry_err.format(lcol, "left"))
+                rtn = rhs[rcol].cat.categories.dtype
+                to_categorical.append(rcol)
+                rhs[rcol + "_codes"] = rhs[rcol].cat.codes
+            elif how in ["inner", "outer"]:
+                if (np.issubdtype(dtype_l, np.number)) and (
+                    np.issubdtype(dtype_r, np.number)
+                ):
+                    if dtype_l.kind == dtype_r.kind:
+                        # both ints or both floats
+                        rtn = max(dtype_l, dtype_r)
+                    else:
+                        rtn = np.find_common_type([], [dtype_l, dtype_r])
+                elif is_datetime_dtype(dtype_l) and is_datetime_dtype(dtype_r):
+                    rtn = max(dtype_l, dtype_r)
+            return rtn
+
+        # Keys are paired positionally, in the order the caller gave them;
+        # sorting each list on its own would pair unrelated columns.
+        to_categorical = []
+        for lcol, rcol in zip(left_on, right_on):
+            if (lcol not in lhs._data) or (rcol not in rhs._data):
+                # probably wrong columns specified, let libcudf error
+                continue
+
+            dtype_l = lhs._data[lcol].dtype
+            dtype_r = rhs._data[rcol].dtype
+            if pd.api.types.is_dtype_equal(dtype_l, dtype_r):
+                continue
+
+            to_dtype = casting_rules(dtype_l, dtype_r, how)
+
+            if to_dtype is not None:
+                lhs[lcol] = lhs[lcol].astype(to_dtype)
+                rhs[rcol] = rhs[rcol].astype(to_dtype)
+
+        return lhs, rhs, to_categorical
+
     def merge(
         self,
         right,
@@ -2407,6 +2488,10 @@ def merge(
         # Save the order of the original column names for preservation later
         org_names = list(itertools.chain(lhs._data.keys(), rhs._data.keys()))
 
+        # potentially do an implicit typecast
+        (lhs, rhs, to_categorical) = self._typecast_before_merge(
+            lhs, rhs, left_on, right_on, how
+        )
         # Compute merge
         gdf_result = libcudf.join.join(
             lhs._data, rhs._data, left_on, right_on, how, method
@@ -2415,6 +2500,7 @@ def merge(
         # Let's sort the columns of the GDF result. NB: Pandas doc says
         # that it sorts when how='outer' but this is NOT the case.
         result = []
+        cat_codes = []
         if sort:
             # Pandas lexicographically sort is NOT a sort of all columns.
             # Instead, it sorts columns in lhs, then in "on", and then rhs.
@@ -2450,18 +2536,26 @@ def merge(
                     if gdf_result[i][1] == org_name:
                         result.append(gdf_result.pop(i))
                         break
+            for cat_name in to_categorical:
+                for i in range(len(gdf_result)):
+                    if gdf_result[i][1] == cat_name + "_codes":
+                        cat_codes.append(gdf_result.pop(i))
             assert len(gdf_result) == 0
 
+        cat_codes = {v: k for k, v in cat_codes}
+
         # Build a new data frame based on the merged columns from GDF
+
         df = DataFrame()
         for col, name in result:
             if is_string_dtype(col):
                 df[name] = col
             elif is_categorical_dtype(categorical_dtypes.get(name, col.dtype)):
+
                 dtype = categorical_dtypes.get(name, col.dtype)
                 df[name] = column.build_categorical_column(
                     categories=dtype.categories,
-                    codes=col,
+                    codes=cat_codes.get(name + "_codes", col),
                     mask=col.mask,
                     ordered=dtype.ordered,
                 )