Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Fix maybe_promote #25425

Closed
wants to merge 37 commits into from
Closed
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
bf05e4c
TST: add test coverage for maybe_promote
h-vetinari Feb 24, 2019
60889ea
REF: refactor and fix maybe_promote
h-vetinari Feb 24, 2019
6792a54
Fix remaining failures
h-vetinari Feb 24, 2019
c6043cb
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Feb 24, 2019
8d9a3b7
Forgot to flake
h-vetinari Feb 24, 2019
b903d2e
Another try at int2int...
h-vetinari Feb 24, 2019
91e6673
Last one?
h-vetinari Feb 24, 2019
1a9d6a1
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Mar 10, 2019
c0a3a4e
Review (jbrockmendel)
h-vetinari Mar 10, 2019
2d3a3ba
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Jun 23, 2019
ce0efda
lint
h-vetinari Jun 23, 2019
e67dd99
reduce diff
h-vetinari Jun 23, 2019
7cec643
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Jun 28, 2019
321f08d
review (jbrockmendel)
h-vetinari Jun 28, 2019
5c5e0f1
fix isort
h-vetinari Jun 28, 2019
bfb8a2f
fix docstring example
h-vetinari Jun 28, 2019
088787e
remove unicode from possible returns of infer_dtype
h-vetinari Jun 28, 2019
bf12d67
perf: only compute max if necessary
h-vetinari Jun 28, 2019
d9bbf07
update comment
h-vetinari Jun 28, 2019
0f1ff5f
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Jul 26, 2019
076926e
format with black
h-vetinari Jul 26, 2019
06e22a3
remove usage of deprecated .real/.imag
h-vetinari Jul 26, 2019
0f8f064
lint
h-vetinari Jul 26, 2019
98c584e
work around GH 27610
h-vetinari Jul 26, 2019
231d8a6
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Jul 28, 2019
2fa6831
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Sep 9, 2019
1cbd5c0
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Sep 21, 2019
32d8a2b
Review (jbrockmendel)
h-vetinari Sep 21, 2019
2a8691a
minor comment improvements/cleanups
h-vetinari Sep 21, 2019
d5aa77b
exploration: check if array-case is required
h-vetinari Sep 22, 2019
fa347b6
Revert "exploration: check if array-case is required"
h-vetinari Sep 22, 2019
8aab981
A painful merge
h-vetinari Oct 28, 2019
82ec973
adapt array-path to new test behaviour
h-vetinari Oct 28, 2019
3c5c3e0
Merge remote-tracking branch 'upstream/master' into fix_maybe_promote
h-vetinari Oct 29, 2019
b5eb1c4
lint: isort
h-vetinari Oct 29, 2019
3976220
catch irrelevant warning
h-vetinari Oct 29, 2019
b8cd4f0
fix outdated iNaT-documentation
h-vetinari Oct 29, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 314 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" routings for casting """

from datetime import datetime, timedelta
import warnings

import numpy as np

Expand Down Expand Up @@ -48,6 +49,7 @@
ABCDataFrame,
ABCDatetimeArray,
ABCDatetimeIndex,
ABCIndexClass,
ABCPeriodArray,
ABCPeriodIndex,
ABCSeries,
Expand All @@ -59,6 +61,9 @@
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max
_int64_min = np.iinfo(np.int64).min
_uint64_max = np.iinfo(np.uint64).max
_float32_max = np.finfo(np.float32).max


def maybe_convert_platform(values):
Expand Down Expand Up @@ -335,6 +340,40 @@ def changeit():


def maybe_promote(dtype, fill_value=np.nan):
"""
Determine minimal dtype to hold fill_value, when starting from dtype

Parameters
----------
dtype : DType
The dtype to start from.
fill_value : scalar or np.ndarray / Series / Index
The value that the output dtype needs to be able to hold.

NOTE: using arrays is discouraged and will likely be removed from this
method in the foreseeable future. Use maybe_promote_with_array instead.

Returns
-------
dtype : DType
The updated dtype.
fill_value : scalar
The type of this value depends on the type of the passed fill_value

* If fill_value is a scalar, the method returns that scalar, but
modified to fit the updated dtype. For example, a datetime fill_value
will be returned as an integer (representing ns) for M8[ns], and
values considered missing (see pd.isna) will be returned as the
corresponding missing value marker for the updated dtype.
* If fill_value is an ndarray/Series/Index, this method will always
return the missing value marker for the updated dtype. This value
will be None for dtypes that cannot hold missing values (integers,
booleans, bytes).

See Also
--------
maybe_promote_with_array : underlying method for array case
"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A PR with just (most of) this docstring would be a good start

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will try to do that.

# if we passed an array here, determine the fill value by dtype
if isinstance(fill_value, np.ndarray):
if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
Expand Down Expand Up @@ -462,6 +501,281 @@ def maybe_promote(dtype, fill_value=np.nan):
return dtype, fill_value


def maybe_promote_with_array(dtype, fill_value=np.nan):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a huge additional to the technical debt. I am -1 on adding this at all. It is not at all clear whether this logic is correct and/or tested. more to the point, what is the purpose of all of this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback you can pretty much ignore this PR; I'm asking @h-vetinari to keep it rebased for reference as we identify parts that are worthwhile to break off into bite-size pieces.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

more to the point, what is the purpose of all of this?

There are a handful of places where we call maybe_promote where we could have fill_value that is an ndarray. Part of the plan for this is to identify in which of those cases we can rule out ndarray.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i that’s fine

happy to pick off good changes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback: this is a huge additional to the technical debt. I am -1 on adding this at all. It is not at all clear whether this logic is correct and/or tested. more to the point, what is the purpose of all of this?

Please read this comment, not just skim over it.

The array-path in maybe_promote is broken and both you and @jbrockmendel were excited to rip it out. At the same time, there's several potential or future use-cases for the array-case, and so I asked twice how this should be handled.

Having a separate method is IMO the least invasive change, and would eventually still allow to rip out the array-path from maybe_promote. And more importantly, the logic is tested with the same promotion tests, which was the whole point of the tests/dtypes/cast/test_promote.py-module. Lastly, since it's a private method, there's no technical debt.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lastly, since it's a private method, there's no technical debt.

@h-vetinari i don’t even know what to say anymore

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the misunderstanding, I meant as in API debt.

The technical debt is already there, in the array-path of maybe_promote. I'm trying to fix it. Feel free to address any of the comments or questions I've raised about this. But if you come into an ancient PR and - without regard for any of the existing context - assert that it must be garbage ("It is not at all clear whether this logic is correct and/or tested [it is]. more to the point, what's the purpose of all of this?"), then I'm gonna respond in kind.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@h-vetinari pls don't respond like this. it is not helpful to anyone.

I can and will come into every PR and make comments. My purpose is to avoid cluttering pandas with technical debt. This PR just adds to it.

"""
Determine minimal dtype to hold fill_value, when starting from dtype

This will also return the default missing value for the resulting dtype, if
necessary (e.g. for datetime / timedelta, the missing value will be `iNaT`)

Parameters
----------
dtype : DType
The dtype to start from.
fill_value : np.ndarray / Series / Index
Array-like of values that the output dtype needs to be able to hold.

Returns
-------
dtype : DType
The updated dtype.
na_value : scalar
The missing value for the new dtype. Returns None or dtypes that
cannot hold missing values (integers, booleans, bytes).

See Also
--------
maybe_promote : similar method for scalar case

Examples
--------
>>> maybe_promote_with_array(np.dtype('int'), fill_value=np.array([None]))
(dtype('float64'), nan)
>>> maybe_promote_with_array(np.dtype('float'),
... fill_value=np.array(['abcd']))
(dtype('O'), nan)

For datetimes, timedeltas and datetimes with a timezone, the missing value
marker is pandas._libs.tslibs.iNaT (== np.iinfo('int64').min):

>>> maybe_promote_with_array(np.dtype('datetime64[ns]'),
... fill_value=np.array([None]))
(dtype('<M8[ns]'), -9223372036854775808)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: fix this


String values do not get cast to datetime/timedelta automatically, but
force an upcast to object (with corresponding missing value marker nan).

>>> maybe_promote_with_array(np.dtype('datetime64[ns]'),
... fill_value=np.array(['2018-01-01']))
(dtype('O'), nan)

The method will infer as conservatively as possible for integer types:

>>> maybe_promote_with_array(
... np.dtype('uint8'), fill_value=np.array([np.iinfo('uint8').max + 1])
... )
(dtype('uint16'), None)
>>> maybe_promote_with_array(np.dtype('uint8'), fill_value=np.array([-1]))
(dtype('int16'), None)
"""

if isinstance(fill_value, np.ndarray):
if fill_value.ndim == 0:
# zero-dimensional arrays cannot be iterated over
fill_value = np.expand_dims(fill_value, 0)
elif fill_value.ndim > 1:
# ndarray, but too high-dimensional
fill_value = fill_value.ravel()
elif not isinstance(fill_value, (ABCSeries, ABCIndexClass)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need Series/Index? ATM we just have ndarray right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see no harm in permitting them (if arrays are permitted at all), as they they fit into the code without extra effort (and later uses of maybe_upcast_putmask might well plop a Series in there).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Until we have a compelling use case, let's restrict the inputs to 1D, non-empty ndarray

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

fill_type = type(fill_value).__name__
raise ValueError(
"fill_value must either be a Series / Index / "
"np.ndarray, received {}".format(fill_type)
)

if all(isna(x) for x in fill_value):
# only missing values (or no values at all)

if is_datetime64_dtype(dtype):
return dtype, np.datetime64("NaT", "ns")
elif is_timedelta64_dtype(dtype):
return dtype, np.timedelta64("NaT", "ns")
elif is_datetime64tz_dtype(dtype):
return dtype, NaT

na_value = np.nan
if len(fill_value) == 0:
# empty array; no values to force change
if is_integer_dtype(dtype) or dtype in (bool, bytes):
# these types do not have a missing value marker
na_value = None
# otherwise nothing changes
elif any(x is NaT for x in fill_value):
# presence of pd.NaT upcasts everything that's not
# datetime/timedelta (see above) to object
dtype = np.dtype(object)
elif is_integer_dtype(dtype):
# integer + other missing value (np.nan / None) casts to float
dtype = np.dtype("float64")
elif is_extension_array_dtype(dtype):
na_value = dtype.na_value
elif is_string_dtype(dtype) or dtype in (bool, bytes):
# original dtype cannot hold nans
dtype = np.dtype(object)

return dtype, na_value

fill_dtype = fill_value.dtype
if fill_dtype == object:
# for object dtype, we determine if we actually need to upcast
# by inferring the dtype of fill_value
inferred_dtype = lib.infer_dtype(fill_value, skipna=True)

# cases that would yield 'empty' have been treated in branch above
if inferred_dtype in ["period", "interval", "datetime64tz"]:
# TODO: handle & test pandas-dtypes
# TODO: lib.infer_dtype does not support datetime64tz yet
pass
else:
# rest can be mapped to numpy dtypes
map_inferred_to_numpy = {
"floating": float,
"mixed-integer-float": float,
"decimal": float,
"integer": int,
"boolean": bool,
"complex": complex,
"bytes": bytes,
"datetime64": "datetime64[ns]",
"datetime": "datetime64[ns]",
"date": "datetime64[ns]",
"timedelta64": "timedelta64[ns]",
"timedelta": "timedelta64[ns]",
"time": object, # time cannot be cast to datetime/timedelta
"string": object,
"mixed-integer": object,
"mixed": object,
}
fill_dtype = np.dtype(map_inferred_to_numpy[inferred_dtype])

# now that we have the correct dtype; check how we must upcast
# * extension arrays
# * int vs int
# * int vs float / complex
# * float vs float
# * float vs complex (and vice versa)
# * bool
# * datetimetz
# * datetime
# * timedelta
# * string/object

# if (is_extension_array_dtype(dtype)
# or is_extension_array_dtype(fill_dtype)):
# # TODO: dispatch to ExtensionDType.maybe_promote? GH 24246
if is_integer_dtype(dtype) and is_integer_dtype(fill_dtype):
if is_unsigned_integer_dtype(dtype) and all(fill_value >= 0):
# can stay unsigned
fill_max = fill_value.max()
if fill_max > _uint64_max:
return np.dtype(object), np.nan

while fill_max > np.iinfo(dtype).max:
# itemsize is the number of bytes; times eight is number of
# bits, which is used in the string identifier of the dtype;
# if fill_max is above the max for that dtype,
# we double the number of bytes/bits.
dtype = np.dtype("uint{}".format(dtype.itemsize * 8 * 2))
return dtype, None
else:
# cannot stay unsigned
if dtype == "uint64":
# need to hold negative values, but int64 cannot hold
# maximum of uint64 -> needs object
return np.dtype(object), np.nan
elif is_unsigned_integer_dtype(dtype):
# need to turn into signed integers to hold negative values
# int8 cannot hold maximum of uint8; similar for 16/32
# therefore, upcast at least to next higher int-type
dtype = np.dtype("int{}".format(dtype.itemsize * 8 * 2))

fill_max = fill_value.max()
fill_min = fill_value.min()
if isinstance(fill_max, np.uint64):
# numpy comparator is broken for uint64;
# see https://github.com/numpy/numpy/issues/12525
# use .item to get int object
fill_max = fill_max.item()

# comparison mechanics are broken above _int64_max;
# use greater equal instead of equal
if fill_max >= _int64_max + 1 or fill_min <= _int64_min - 1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use the can_cast machinery machinery currently in the scalar function? or even just dispatch to the scalar function in some cases?

Copy link
Contributor Author

@h-vetinari h-vetinari Oct 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dispatching to the scalar case is IMO out of the question for performance reasons until this whole code is cythonized (or the logic somehow unified with lib.maybe_convert_object).

return np.dtype(object), np.nan

while fill_max > np.iinfo(dtype).max or fill_min < np.iinfo(dtype).min:
# same mechanism as above, but for int instead of uint
dtype = np.dtype("int{}".format(dtype.itemsize * 8 * 2))
return dtype, None
elif is_integer_dtype(dtype) and is_float_dtype(fill_dtype):
# int with float: always upcasts to float64
return np.dtype("float64"), np.nan
elif is_integer_dtype(dtype) and is_complex_dtype(fill_dtype):
# int with complex: always upcasts to complex128
return np.dtype("complex128"), np.nan
elif (is_float_dtype(dtype) or is_complex_dtype(dtype)) and is_integer_dtype(
fill_dtype
):
# float/complex with int: always stays original float/complex dtype
return dtype, np.nan
elif is_float_dtype(dtype) and is_float_dtype(fill_dtype):
# float with float; upcasts depending on absolute max of fill_value
if dtype == "float32" and np.abs(fill_value).max() <= _float32_max:
return dtype, np.nan
# all other cases return float64
return np.dtype("float64"), np.nan
elif (is_float_dtype(dtype) or is_complex_dtype(dtype)) and (
is_float_dtype(fill_dtype) or is_complex_dtype(fill_dtype)
):
# at least one is complex; otherwise we'd have hit float/float above
with warnings.catch_warnings():
# work around GH 27610
warnings.filterwarnings("ignore", category=FutureWarning)
if (
dtype in ["float32", "complex64"]
and max(
np.abs(np.real(fill_value)).max(), # also works for float
np.abs(np.imag(fill_value)).max(),
)
<= _float32_max
):
return np.complex64, np.nan
# all other cases return complex128
return np.dtype("complex128"), np.nan
elif is_bool_dtype(dtype) and is_bool_dtype(fill_dtype):
# bool with bool is the only combination that stays bool; any other
# combination involving bool upcasts to object, see else-clause below
return dtype, None
elif (
is_datetime64tz_dtype(dtype)
and is_datetime64tz_dtype(fill_dtype)
and (dtype.tz == fill_dtype.tz)
):
# datetimetz with datetimetz with the same timezone is the only
# combination that stays datetimetz (in particular, mixing timezones or
# tz-aware and tz-naive datetimes will cast to object); any other
# combination involving datetimetz upcasts to object, see below
return dtype, NaT
elif (is_timedelta64_dtype(dtype) and is_timedelta64_dtype(fill_dtype)) or (
is_datetime64_dtype(dtype) and is_datetime64_dtype(fill_dtype)
):
# datetime and timedelta try to cast; if successful, keep dtype,
# otherwise upcast to object
try:
with warnings.catch_warnings():
msg = (
"parsing timezone aware datetimes is deprecated; "
"this will raise an error in the future"
)
warnings.filterwarnings(
"ignore", message=msg, category=DeprecationWarning
)
fill_value.astype(dtype)

# can simplify if-cond. compared to cond. for entering this branch
if is_datetime64_dtype(dtype):
na_value = np.datetime64("NaT", "ns")
else:
na_value = np.timedelta64("NaT", "ns")
except (ValueError, TypeError):
dtype = np.dtype(object)
na_value = np.nan
return dtype, na_value
else:
# anything else (e.g. strings, objects, bytes, or unmatched
# bool / datetime / datetimetz / timedelta)
return np.dtype(object), np.nan


def _ensure_dtype_type(value, dtype):
"""
Ensure that the given value is an instance of the given dtype.
Expand Down
Loading