Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TYP: core/dtypes/cast.py #37024

Merged
merged 13 commits into from
Oct 14, 2020
98 changes: 70 additions & 28 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,18 @@
"""

from datetime import date, datetime, timedelta
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
from typing import (
TYPE_CHECKING,
Any,
List,
Optional,
Sequence,
Set,
Sized,
Tuple,
Type,
Union,
)

import numpy as np

Expand All @@ -18,7 +29,7 @@
ints_to_pydatetime,
)
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import ArrayLike, Dtype, DtypeObj
from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -83,6 +94,8 @@
if TYPE_CHECKING:
from pandas import Series
from pandas.core.arrays import ExtensionArray
from pandas.core.indexes.base import Index
from pandas.core.indexes.datetimes import DatetimeIndex

_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
Expand Down Expand Up @@ -118,7 +131,7 @@ def is_nested_object(obj) -> bool:
return False


def maybe_downcast_to_dtype(result, dtype):
def maybe_downcast_to_dtype(result, dtype: Dtype):
"""
try to cast to the specified dtype (e.g. convert back to bool/int
or could be an astype of float64->float32
Expand Down Expand Up @@ -186,7 +199,7 @@ def maybe_downcast_to_dtype(result, dtype):
return result


def maybe_downcast_numeric(result, dtype, do_round: bool = False):
def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False):
"""
Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

Expand Down Expand Up @@ -329,7 +342,9 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
return dtype


def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None):
def maybe_cast_to_extension_array(
cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None
) -> ArrayLike:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ArrayLike is a TypeVar, so the return type is typed to be the same as obj.

I think mypy is accepting this since np.ndarray resolves to Any

We probably need an ArrayLikeUnion, but this is perhaps out of scope here as the discussion has been started elsewhere. xref #36100

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall I remove the annotation and add a TODO?

As an aside FWIW I like the ArrayLike/ArrayLikeT idea in https://github.com/pandas-dev/pandas/pull/36100/files#r483753222

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can leave as is for now. we have #36092 to track the issues once np.ndarray no longer resolves to Any.

"""
Call to `_from_sequence` that returns the object unchanged on Exception.

Expand Down Expand Up @@ -362,7 +377,9 @@ def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None):
return result


def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other):
def maybe_upcast_putmask(
result: np.ndarray, mask: np.ndarray, other: Scalar
) -> Tuple[np.ndarray, bool]:
"""
A safe version of putmask that potentially upcasts the result.

Expand Down Expand Up @@ -444,7 +461,9 @@ def changeit():
return result, False


def maybe_casted_values(index, codes=None):
def maybe_casted_values(
index: "Index", codes: Optional[np.ndarray] = None
) -> ArrayLike:
"""
Convert an index, given directly or as a pair (level, code), to a 1D array.

Expand All @@ -468,7 +487,7 @@ def maybe_casted_values(index, codes=None):

# if we have the codes, extract the values with a mask
if codes is not None:
mask = codes == -1
mask: np.ndarray = codes == -1
Copy link
Member

@simonjayhawkins simonjayhawkins Oct 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why was this added? mypy is not reporting any errors for me with this removed.

Generally, try to avoid adding variable type annotations; the type should be inferred from the expression.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mistake, will revert

I had tried typing index and codes but there were a lot of errors - this was part of an attempted fix.


# we can have situations where the whole mask is -1,
# meaning there is nothing found in codes, so make all nan's
Expand Down Expand Up @@ -660,7 +679,7 @@ def maybe_promote(dtype, fill_value=np.nan):
return dtype, fill_value


def _ensure_dtype_type(value, dtype):
def _ensure_dtype_type(value, dtype: DtypeObj):
"""
Ensure that the given value is an instance of the given dtype.

Expand Down Expand Up @@ -786,8 +805,9 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,
return dtype, val


# TODO: try to make the Any in the return annotation more specific
def infer_dtype_from_array(arr, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]:
def infer_dtype_from_array(
arr, pandas_dtype: bool = False
) -> Tuple[DtypeObj, ArrayLike]:
"""
Infer the dtype from an array.

Expand Down Expand Up @@ -875,7 +895,12 @@ def maybe_infer_dtype_type(element):
return tipo


def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False):
def maybe_upcast(
values: ArrayLike,
fill_value: Scalar = np.nan,
dtype: Dtype = None,
copy: bool = False,
) -> Tuple[ArrayLike, Scalar]:
"""
Provide explicit type promotion and coercion.

Expand All @@ -887,6 +912,13 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False):
dtype : if None, then use the dtype of the values, else coerce to this type
copy : bool, default True
If True always make a copy even if no upcast is required.

Returns
-------
values: ndarray or ExtensionArray
the original array, possibly upcast
fill_value:
the fill value, possibly upcast
"""
if not is_scalar(fill_value) and not is_object_dtype(values.dtype):
# We allow arbitrary fill values for object dtype
Expand All @@ -907,7 +939,7 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False):
return values, fill_value


def invalidate_string_dtypes(dtype_set):
def invalidate_string_dtypes(dtype_set: Set[DtypeObj]):
"""
Change string like dtypes to object for
``DataFrame.select_dtypes()``.
Expand All @@ -929,7 +961,7 @@ def coerce_indexer_dtype(indexer, categories):
return ensure_int64(indexer)


def coerce_to_dtypes(result, dtypes):
def coerce_to_dtypes(result: Sequence[Scalar], dtypes: Sequence[Dtype]) -> List[Scalar]:
"""
given a dtypes and a result set, coerce the result elements to the
dtypes
Expand Down Expand Up @@ -959,7 +991,9 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
def astype_nansafe(
arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
"""
Cast the elements of an array to a given dtype a nan-safe manner.

Expand Down Expand Up @@ -1063,7 +1097,9 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
return arr.view(dtype)


def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True):
def maybe_convert_objects(
values: np.ndarray, convert_numeric: bool = True
) -> Union[np.ndarray, "DatetimeIndex"]:
"""
If we have an object dtype array, try to coerce dates and/or numbers.

Expand Down Expand Up @@ -1184,7 +1220,7 @@ def soft_convert_objects(


def convert_dtypes(
input_array,
input_array: AnyArrayLike,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you update the docstring to match?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
Expand All @@ -1195,7 +1231,7 @@ def convert_dtypes(

Parameters
----------
input_array : ExtensionArray or PandasArray
input_array : ExtensionArray, Index, Series or np.ndarray
convert_string : bool, default True
Whether object dtypes should be converted to ``StringDtype()``.
convert_integer : bool, default True
Expand Down Expand Up @@ -1250,9 +1286,11 @@ def convert_dtypes(
return inferred_dtype


def maybe_castable(arr) -> bool:
def maybe_castable(arr: np.ndarray) -> bool:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we never get EAs here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Atm the only use case is here:

def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool):
"""
Convert input to numpy ndarray and optionally cast to a given dtype.
Parameters
----------
arr : ndarray, scalar, list, tuple, iterator (catchall)
Excludes: ExtensionArray, Series, Index.
dtype : np.dtype, ExtensionDtype or None
copy : bool
If False, don't copy the data if not needed.
raise_cast_failure : bool
If True, and if a dtype is specified, raise errors during casting.
Otherwise an object array is returned.
"""
# perf shortcut as this is the most common case
if isinstance(arr, np.ndarray):
if maybe_castable(arr) and not copy and dtype is None:
return arr

which according to docstring excludes ExtensionArray

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add an assert in the body itself to lock this down? (Whether we want this restriction long-term is a separate question, but having the guarantee is good for now.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added

# return False to force a non-fastpath

assert isinstance(arr, np.ndarray) # GH 37024

# check datetime64[ns]/timedelta64[ns] are valid
# otherwise try to coerce
kind = arr.dtype.kind
Expand All @@ -1264,7 +1302,9 @@ def maybe_castable(arr) -> bool:
return arr.dtype.name not in POSSIBLY_CAST_DTYPES


def maybe_infer_to_datetimelike(value, convert_dates: bool = False):
def maybe_infer_to_datetimelike(
value: Union[ArrayLike, Scalar], convert_dates: bool = False
):
"""
we might have a array (or single object) that is datetime like,
and no dtype is passed don't change the value unless we find a
Expand Down Expand Up @@ -1373,7 +1413,7 @@ def try_timedelta(v):
return value


def maybe_cast_to_datetime(value, dtype, errors: str = "raise"):
def maybe_cast_to_datetime(value, dtype: DtypeObj, errors: str = "raise"):
"""
try to cast the array/value to a datetimelike dtype, converting float
nan to iNaT
Expand Down Expand Up @@ -1566,7 +1606,9 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj:
return np.find_common_type(types, [])


def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.ndarray:
def cast_scalar_to_array(
shape: Tuple, value: Scalar, dtype: Optional[DtypeObj] = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we just pass shape onto np.empty, https://numpy.org/doc/1.19/reference/generated/numpy.empty.html

so could be int or tuple of int

I recall a discussion about adding an alias to pandas._typing or

Suggested change
shape: Tuple, value: Scalar, dtype: Optional[DtypeObj] = None
shape: Union[int, Tuple[int, ...]], value: Scalar, dtype: Optional[DtypeObj] = None

and the docstring would need to be updated too.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pushed the suggestion.

Adding an alias to _typing sounds like a good idea. Do we already have a PR on that or shall I submit one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we already have a PR on that or shall I submit one?

I don't think so, go ahead.

) -> np.ndarray:
"""
Create np.ndarray of specified shape and dtype, filled with values.

Expand Down Expand Up @@ -1594,7 +1636,7 @@ def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.n


def construct_1d_arraylike_from_scalar(
value, length: int, dtype: DtypeObj
value: Scalar, length: int, dtype: DtypeObj
) -> ArrayLike:
"""
create a np.ndarray / pandas type of specified shape and dtype
Expand Down Expand Up @@ -1638,7 +1680,7 @@ def construct_1d_arraylike_from_scalar(
return subarr


def construct_1d_object_array_from_listlike(values) -> np.ndarray:
def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
"""
Transform any list-like object in a 1-dimensional numpy array of object
dtype.
Expand All @@ -1664,7 +1706,7 @@ def construct_1d_object_array_from_listlike(values) -> np.ndarray:


def construct_1d_ndarray_preserving_na(
values, dtype: Optional[DtypeObj] = None, copy: bool = False
values: Sequence, dtype: Optional[DtypeObj] = None, copy: bool = False
) -> np.ndarray:
"""
Construct a new ndarray, coercing `values` to `dtype`, preserving NA.
Expand Down Expand Up @@ -1698,7 +1740,7 @@ def construct_1d_ndarray_preserving_na(
return subarr


def maybe_cast_to_integer_array(arr, dtype, copy: bool = False):
def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False):
"""
Takes any dtype and returns the casted version, raising for when data is
incompatible with integer/unsigned integer dtypes.
Expand Down Expand Up @@ -1768,7 +1810,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False):
raise ValueError("Trying to coerce float values to integers")


def convert_scalar_for_putitemlike(scalar, dtype: np.dtype):
def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar:
"""
Convert datetimelike scalar if we are setting into a datetime64
or timedelta64 ndarray.
Expand Down Expand Up @@ -1799,7 +1841,7 @@ def convert_scalar_for_putitemlike(scalar, dtype: np.dtype):
return scalar


def validate_numeric_casting(dtype: np.dtype, value):
def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None:
"""
Check that we can losslessly insert the given value into an array
with the given dtype.
Expand Down