Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add implicit typecasting of join columns when dtypes do not match #3451

Merged
merged 34 commits into from
Jan 12, 2020
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
184b5ba
baseline numeric test
brandon-b-miller Nov 13, 2019
e70ef39
baseline numeric implementation
brandon-b-miller Nov 13, 2019
58aa9c9
test for everything pandas supports, else skip
brandon-b-miller Nov 13, 2019
57a4bd7
BROKEN: handle left-categorical cases
brandon-b-miller Nov 15, 2019
3fb5d95
Merge branch 'branch-0.11' into enh-typecast-on-join
brandon-b-miller Nov 18, 2019
817410d
add datetime only test
brandon-b-miller Nov 18, 2019
84dd42a
implement upcasting for datetime
brandon-b-miller Nov 18, 2019
425afc4
abandon pandas logic and invent our own
brandon-b-miller Nov 19, 2019
5dee693
mixed int/float test
brandon-b-miller Nov 19, 2019
19c5641
refactor logic, add tests
brandon-b-miller Nov 21, 2019
eccaebc
handle categorical-non categorical merge cases
brandon-b-miller Nov 22, 2019
ca10de1
maybe last categorical bug fixed, all cudf tests pass
brandon-b-miller Nov 22, 2019
618a9cf
remove unused code
brandon-b-miller Nov 22, 2019
a364ff6
handle overflow, refactor
brandon-b-miller Nov 25, 2019
4abd247
merge 0.11
brandon-b-miller Nov 25, 2019
40cbf9f
fix tests
brandon-b-miller Nov 25, 2019
cc74a2a
style
brandon-b-miller Nov 25, 2019
b322d10
changelog
brandon-b-miller Nov 25, 2019
8865df2
pass colname mismatches to libcudf to error
brandon-b-miller Nov 26, 2019
94dd0f7
relocate tests to test_joining and rename
brandon-b-miller Nov 26, 2019
77b8c16
test overflow guard
brandon-b-miller Dec 12, 2019
80984e9
implement overflow_safe_to
brandon-b-miller Dec 12, 2019
e4ac504
use overflow_safe_to in _typecast_before_merge and fillna
brandon-b-miller Dec 12, 2019
e2da964
Merge branch 'branch-0.12' into enh-typecast-on-join
brandon-b-miller Dec 24, 2019
09447f4
style
brandon-b-miller Jan 7, 2020
7ecf789
style
brandon-b-miller Jan 7, 2020
38b6d58
Merge branch 'branch-0.12' into enh-typecast-on-join
brandon-b-miller Jan 9, 2020
e47b96f
fix accidental test inversion from style correction
brandon-b-miller Jan 10, 2020
d6947ac
merge refactor, solve categorical test failures
brandon-b-miller Jan 10, 2020
d49d211
Merge branch 'branch-0.12' into enh-typecast-on-join
brandon-b-miller Jan 10, 2020
8ea0e80
update error formatting
brandon-b-miller Jan 10, 2020
8203487
raise when categories do not match for a column
brandon-b-miller Jan 10, 2020
80de301
overflow_safe_to -> can_cast_safely
brandon-b-miller Jan 11, 2020
5fd8bf1
fix tests
brandon-b-miller Jan 11, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@
- PR #3500 cudf::fill()/cudf::repeat() support for strings columns.
- PR #3438 Update scalar and scalar_device_view to better support strings
- PR #3414 Add copy_range function for strings column
- PR #3451 Add support for implicit typecasting of join columns

## Bug Fixes

Expand Down
55 changes: 55 additions & 0 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,61 @@ def is_monotonic_decreasing(self):
)
return self._is_monotonic_decreasing

def can_cast_safely(self, to_dtype):
    """
    Return True if all the values in self can be cast to ``to_dtype``
    without falling outside what the target dtype can hold.

    Parameters
    ----------
    to_dtype : numpy.dtype
        The dtype the column would be cast to.

    Returns
    -------
    bool
        True when the cast is considered safe, False otherwise
        (including kind combinations this method has no rule for).

    Notes
    -----
    For a same-kind narrowing cast only the value *range* is checked;
    a float64 -> float32 cast whose values fit in range is reported as
    safe even though precision may be lost.
    """
    from_kind = self.dtype.kind
    to_kind = to_dtype.kind

    if from_kind == to_kind:
        if self.dtype <= to_dtype:
            # Same kind and the target is at least as wide.
            return True

        # Kinds are the same but to_dtype is smaller
        if "float" in to_dtype.name:
            info = np.finfo(to_dtype)
        elif "int" in to_dtype.name:
            info = np.iinfo(to_dtype)
        else:
            # No range information available for this dtype.
            return False

        # Bounds are inclusive: a value equal to the target's min or
        # max is still representable in the target dtype.
        return bool(info.min <= self.min() and self.max() <= info.max)

    # want to cast int to float
    if to_kind == "f" and from_kind == "i":
        info = np.finfo(to_dtype)
        # Every integer of magnitude up to 2 ** (mantissa bits + 1)
        # is exactly representable in the float type.
        biggest_exact_int = 2 ** (info.nmant + 1)
        if (-biggest_exact_int <= self.min()) and (
            self.max() <= biggest_exact_int
        ):
            return True

        # Outside the guaranteed-exact range: fall back to a
        # round-trip comparison of the actual values.
        from cudf import Series

        round_trip = Series(self).astype(to_dtype).astype(self.dtype)
        return bool((round_trip == Series(self)).all())

    # want to cast float to int:
    if to_kind == "i" and from_kind == "f":
        info = np.iinfo(to_dtype)
        # best we can do is hope to catch it here and avoid compare
        if (info.min <= self.min()) and (self.max() <= info.max):
            from cudf import Series

            # In range; safe only if every value is integral.
            return bool((Series(self) % 1 == 0).all())
        return False

    # No rule for this combination of kinds (e.g. signed -> unsigned).
    return False


def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
if reflect:
Expand Down
96 changes: 95 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,87 @@ def melt(self, **kwargs):

return melt(self, **kwargs)

def _typecast_before_merge(self, lhs, rhs, left_on, right_on, how):
# Reconcile the dtypes of each (left key, right key) pair before the join.
# For every pair whose dtypes differ, a common dtype is chosen by
# casting_rules and both key columns are reassigned on lhs / rhs with
# astype. Returns (lhs, rhs, to_categorical), where to_categorical lists
# the key columns for which a "<name>_codes" column was added.
#
# casting_rules returns the dtype to cast one key pair to, or None when no
# rule applies. It reads lcol, rcol, lhs, rhs and to_categorical from the
# enclosing scope, so it is only meaningful inside the loop below.
def casting_rules(dtype_l, dtype_r, how):
cast_warn = "can't safely cast column {} from {} with type \
{} to {}, upcasting to {}"
ctgry_err = "can't implicitly cast column {0} to categories \
from {1} during {1} join"
# NOTE(review): the lines from "Copy link" down to the
# `ctgry_err.format(...)` example are a reviewer's comment captured from
# the pull-request page, not part of the function body.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to change this; just a comment that it might be preferable to do something like:

ctgry_err = ("can't implicitly cast column {column_id} to categories"
                   " from {how} during {how} join")

ctgry_err.format(column_id=rcol, how="right")


rtn = None
if pd.api.types.is_dtype_equal(dtype_l, dtype_r):
rtn = dtype_l
# Two categorical dtypes that are not equal cannot be reconciled.
elif is_categorical_dtype(dtype_l) and is_categorical_dtype(
dtype_r
):
raise TypeError("Left and right categories must be the same.")
# Left join: keep the left dtype if the right column's values (nulls
# filled with 0) can be cast to it safely; otherwise fall back to the
# "inner" rule and warn about the upcast.
elif how == "left":

check_col = rhs._data[rcol].fillna(0)
if not check_col.can_cast_safely(dtype_l):
rtn = casting_rules(dtype_l, dtype_r, "inner")
warnings.warn(
cast_warn.format(rcol, "right", dtype_r, dtype_l, rtn)
)
else:
rtn = dtype_l
# Right join: mirror image of the left-join rule above.
elif how == "right":
check_col = lhs._data[lcol].fillna(0)
if not check_col.can_cast_safely(dtype_r):
rtn = casting_rules(dtype_l, dtype_r, "inner")
warnings.warn(
cast_warn.format(lcol, "left", dtype_l, dtype_r, rtn)
)
else:
rtn = dtype_r

# NOTE(review): the `how == "left"` / `how == "right"` branches above are
# tested first, so the `how` checks inside the two categorical branches
# below look unreachable, and a left/right join with one categorical key
# appears to reach can_cast_safely with a categorical dtype -- confirm the
# intended branch order.
# Categorical on one side only: join on the categories' dtype and stash the
# codes in a "<name>_codes" column, recording the column in to_categorical.
elif is_categorical_dtype(dtype_l):
if how == "right":
raise ValueError(ctgry_err.format(rcol, "right"))

rtn = lhs[lcol].cat.categories.dtype
to_categorical.append(lcol)
lhs[lcol + "_codes"] = lhs[lcol].cat.codes
elif is_categorical_dtype(dtype_r):
if how == "left":
raise ValueError(ctgry_err.format(lcol, "left"))
rtn = rhs[rcol].cat.categories.dtype
to_categorical.append(rcol)
rhs[rcol + "_codes"] = rhs[rcol].cat.codes
# Inner / outer join: two numeric dtypes of the same kind resolve to the
# larger one, mixed kinds go through np.find_common_type, and two datetime
# dtypes resolve to the larger one. Anything else leaves rtn as None.
elif how in ["inner", "outer"]:
if (np.issubdtype(dtype_l, np.number)) and (
np.issubdtype(dtype_r, np.number)
):
if dtype_l.kind == dtype_r.kind:
# both ints or both floats
rtn = max(dtype_l, dtype_r)
else:
rtn = np.find_common_type([], [dtype_l, dtype_r])
elif is_datetime_dtype(dtype_l) and is_datetime_dtype(dtype_r):
rtn = max(dtype_l, dtype_r)
return rtn

# NOTE(review): left_on and right_on are sorted independently, so the pairs
# produced by zip below can differ from the caller's pairing when the key
# names differ between the two sides -- confirm this is intended.
left_on = sorted(left_on)
right_on = sorted(right_on)
to_categorical = []
for lcol, rcol in zip(left_on, right_on):
if (lcol not in lhs._data) or (rcol not in rhs._data):
# probably wrong columns specified, let libcudf error
continue

dtype_l = lhs._data[lcol].dtype
dtype_r = rhs._data[rcol].dtype
# Nothing to do when the key dtypes already agree.
if pd.api.types.is_dtype_equal(dtype_l, dtype_r):
continue

to_dtype = casting_rules(dtype_l, dtype_r, how)

# None means no rule matched; the columns are then left untouched.
if to_dtype is not None:
lhs[lcol] = lhs[lcol].astype(to_dtype)
rhs[rcol] = rhs[rcol].astype(to_dtype)

return lhs, rhs, to_categorical

def merge(
self,
right,
Expand Down Expand Up @@ -2407,6 +2488,10 @@ def merge(
# Save the order of the original column names for preservation later
org_names = list(itertools.chain(lhs._data.keys(), rhs._data.keys()))

# potentially do an implicit typecast
(lhs, rhs, to_categorical) = self._typecast_before_merge(
lhs, rhs, left_on, right_on, how
)
# Compute merge
gdf_result = libcudf.join.join(
lhs._data, rhs._data, left_on, right_on, how, method
Expand All @@ -2415,6 +2500,7 @@ def merge(
# Let's sort the columns of the GDF result. NB: Pandas doc says
# that it sorts when how='outer' but this is NOT the case.
result = []
cat_codes = []
if sort:
# Pandas lexicographically sort is NOT a sort of all columns.
# Instead, it sorts columns in lhs, then in "on", and then rhs.
Expand Down Expand Up @@ -2450,18 +2536,26 @@ def merge(
if gdf_result[i][1] == org_name:
result.append(gdf_result.pop(i))
break
for cat_name in to_categorical:
for i in range(len(gdf_result)):
if gdf_result[i][1] == cat_name + "_codes":
cat_codes.append(gdf_result.pop(i))
assert len(gdf_result) == 0

cat_codes = {v: k for k, v in cat_codes}

# Build a new data frame based on the merged columns from GDF

df = DataFrame()
for col, name in result:
if is_string_dtype(col):
df[name] = col
elif is_categorical_dtype(categorical_dtypes.get(name, col.dtype)):

dtype = categorical_dtypes.get(name, col.dtype)
df[name] = column.build_categorical_column(
categories=dtype.categories,
codes=col,
codes=cat_codes.get(name + "_codes", col),
mask=col.mask,
ordered=dtype.ordered,
)
Expand Down
Loading