Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: silently ignoring dtype kwarg in Index.__new__ #38879

Merged
merged 5 commits into from
Jan 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ ExtensionArray

Other
^^^^^

- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
-
-

Expand Down
13 changes: 13 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1529,6 +1529,19 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None


def is_ea_or_datetimelike_dtype(dtype: Optional[DtypeObj]) -> bool:
    """
    Check for ExtensionDtype, datetime64 dtype, or timedelta64 dtype.

    Parameters
    ----------
    dtype : np.dtype, ExtensionDtype, or None
        The dtype object to check.

    Returns
    -------
    bool
        True if ``dtype`` is an ExtensionDtype instance, or a ``np.dtype``
        whose ``kind`` is "m" or "M"; False otherwise (including for None).

    Notes
    -----
    Checks only for dtype objects, not dtype-castable strings or types.
    """
    # The isinstance guard on np.dtype keeps strings such as "datetime64[ns]"
    # (and None) from reaching the ``.kind`` attribute lookup.
    return isinstance(dtype, ExtensionDtype) or (
        isinstance(dtype, np.dtype) and dtype.kind in ("m", "M")
    )


def is_complex_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of a complex dtype.
Expand Down
142 changes: 54 additions & 88 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
ensure_platform_int,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_any_dtype,
is_dtype_equal,
is_ea_or_datetimelike_dtype,
is_extension_array_dtype,
is_float,
is_float_dtype,
Expand All @@ -56,10 +56,8 @@
is_iterator,
is_list_like,
is_object_dtype,
is_period_dtype,
is_scalar,
is_signed_integer_dtype,
is_timedelta64_dtype,
is_unsigned_integer_dtype,
needs_i8_conversion,
pandas_dtype,
Expand All @@ -69,6 +67,7 @@
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
IntervalDtype,
PeriodDtype,
)
Expand All @@ -87,6 +86,7 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import IndexOpsMixin, PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
Expand Down Expand Up @@ -286,44 +286,32 @@ def __new__(

# range
if isinstance(data, RangeIndex):
return RangeIndex(start=data, copy=copy, dtype=dtype, name=name)
result = RangeIndex(start=data, copy=copy, name=name)
if dtype is not None:
return result.astype(dtype, copy=False)
return result
elif isinstance(data, range):
return RangeIndex.from_range(data, dtype=dtype, name=name)

# categorical
elif is_categorical_dtype(data_dtype) or is_categorical_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas.core.indexes.category import CategoricalIndex

return _maybe_asobject(dtype, CategoricalIndex, data, copy, name, **kwargs)

# interval
elif is_interval_dtype(data_dtype) or is_interval_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas.core.indexes.interval import IntervalIndex

return _maybe_asobject(dtype, IntervalIndex, data, copy, name, **kwargs)

elif is_datetime64_any_dtype(data_dtype) or is_datetime64_any_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas import DatetimeIndex

return _maybe_asobject(dtype, DatetimeIndex, data, copy, name, **kwargs)

elif is_timedelta64_dtype(data_dtype) or is_timedelta64_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas import TimedeltaIndex

return _maybe_asobject(dtype, TimedeltaIndex, data, copy, name, **kwargs)

elif is_period_dtype(data_dtype) or is_period_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas import PeriodIndex
result = RangeIndex.from_range(data, name=name)
if dtype is not None:
return result.astype(dtype, copy=False)
return result

return _maybe_asobject(dtype, PeriodIndex, data, copy, name, **kwargs)
if is_ea_or_datetimelike_dtype(dtype):
# non-EA dtype indexes have special casting logic, so we punt here
klass = cls._dtype_to_subclass(dtype)
if klass is not Index:
return klass(data, dtype=dtype, copy=copy, name=name, **kwargs)

if is_ea_or_datetimelike_dtype(data_dtype):
klass = cls._dtype_to_subclass(data_dtype)
if klass is not Index:
result = klass(data, copy=copy, name=name, **kwargs)
if dtype is not None:
return result.astype(dtype, copy=False)
return result

# extension dtype
elif is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype):
if is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype):
if not (dtype is None or is_object_dtype(dtype)):
# coerce to the provided dtype
ea_cls = dtype.construct_array_type()
Expand Down Expand Up @@ -407,26 +395,38 @@ def _ensure_array(cls, data, dtype, copy: bool):
def _dtype_to_subclass(cls, dtype: DtypeObj):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423

if isinstance(dtype, DatetimeTZDtype) or dtype == np.dtype("M8[ns]"):
if isinstance(dtype, ExtensionDtype):
if isinstance(dtype, DatetimeTZDtype):
from pandas import DatetimeIndex

return DatetimeIndex
elif isinstance(dtype, CategoricalDtype):
from pandas import CategoricalIndex

return CategoricalIndex
elif isinstance(dtype, IntervalDtype):
from pandas import IntervalIndex

return IntervalIndex
elif isinstance(dtype, PeriodDtype):
from pandas import PeriodIndex

return PeriodIndex

elif isinstance(dtype, SparseDtype):
return cls._dtype_to_subclass(dtype.subtype)

return Index

if dtype.kind == "M":
from pandas import DatetimeIndex

return DatetimeIndex
elif dtype == "m8[ns]":

elif dtype.kind == "m":
from pandas import TimedeltaIndex

return TimedeltaIndex
elif isinstance(dtype, CategoricalDtype):
from pandas import CategoricalIndex

return CategoricalIndex
elif isinstance(dtype, IntervalDtype):
from pandas import IntervalIndex

return IntervalIndex
elif isinstance(dtype, PeriodDtype):
from pandas import PeriodIndex

return PeriodIndex

elif is_float_dtype(dtype):
from pandas import Float64Index
Expand All @@ -445,6 +445,9 @@ def _dtype_to_subclass(cls, dtype: DtypeObj):
# NB: assuming away MultiIndex
return Index

elif issubclass(dtype.type, (str, bool, np.bool_)):
return Index

raise NotImplementedError(dtype)

"""
Expand Down Expand Up @@ -6253,43 +6256,6 @@ def _try_convert_to_int_array(
raise ValueError


def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs):
    """
    Build an Index of type ``klass``, casting to object afterwards if asked.

    When ``dtype`` is object, the Index is first constructed without any
    dtype and only then converted with ``.astype(object)``; for any other
    ``dtype`` the construction is delegated to ``klass`` directly.

    Parameters
    ----------
    dtype : np.dtype, ExtensionDtype, str
    klass : Index subclass
    data : list-like
    copy : bool
    name : hashable
    **kwargs

    Returns
    -------
    Index

    Notes
    -----
    We assume that calling .astype(object) on this klass will make a copy.
    """
    if not is_dtype_equal(_o_dtype, dtype):
        # Nothing special to do: let the subclass handle the requested dtype.
        return klass(data, dtype=dtype, copy=copy, name=name, **kwargs)

    # GH#23524 passing `dtype=object` to DatetimeIndex is invalid and will
    # raise in the case where `data` is already tz-aware, so the dtype is
    # left out of the construction and the cast to object-dtype happens
    # afterwards.
    # copy=False is fine here because the .astype below is assumed to
    # always make a copy.
    non_object_index = klass(data, copy=False, name=name, **kwargs)
    return non_object_index.astype(object)


def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]:
"""
Return common name if all indices agree, otherwise None (level-by-level).
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/indexes/ranges/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,6 @@ def test_constructor_range(self):
expected = RangeIndex(1, 5, 2)
tm.assert_index_equal(result, expected, exact=True)

with pytest.raises(
ValueError,
match="Incorrect `dtype` passed: expected signed integer, received float64",
):
Index(range(1, 5, 2), dtype="float64")
msg = r"^from_range\(\) got an unexpected keyword argument"
with pytest.raises(TypeError, match=msg):
RangeIndex.from_range(range(10), copy=True)
Expand Down
78 changes: 78 additions & 0 deletions pandas/tests/indexes/test_index_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,22 @@

from pandas import (
NA,
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Int64Index,
IntervalIndex,
MultiIndex,
NaT,
PeriodIndex,
Series,
TimedeltaIndex,
Timestamp,
UInt64Index,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm

Expand Down Expand Up @@ -122,6 +126,80 @@ def test_constructor_mixed_nat_objs_infers_object(self, swap_objs):
tm.assert_index_equal(Index(np.array(data, dtype=object)), expected)


class TestDtypeEnforced:
    # An explicitly passed ``dtype`` must never be silently dropped by Index().

    @pytest.mark.parametrize("dtype", [object, "float64", "uint64", "category"])
    def test_constructor_range_values_mismatched_dtype(self, dtype):
        # data given as an already-constructed Index built from a range
        range_backed = Index(range(5))
        from_index = Index(range_backed, dtype=dtype)
        assert from_index.dtype == dtype

        # data given as a plain builtin range
        from_range = Index(range(5), dtype=dtype)
        assert from_range.dtype == dtype

    @pytest.mark.parametrize("dtype", [object, "float64", "uint64", "category"])
    def test_constructor_categorical_values_mismatched_non_ea_dtype(self, dtype):
        values = Categorical([1, 2, 3])

        built = Index(values, dtype=dtype)
        assert built.dtype == dtype

    def test_constructor_categorical_values_mismatched_dtype(self):
        # Categorical wrapping tz-naive datetimes, dtype passed positionally
        naive = date_range("2016-01-01", periods=3)
        naive_cat = Categorical(naive)
        built = Index(naive_cat, naive.dtype)
        tm.assert_index_equal(built, naive)

        # Categorical wrapping tz-aware datetimes, dtype passed positionally
        aware = naive.tz_localize("Asia/Tokyo")
        aware_cat = Categorical(aware)
        built = Index(aware_cat, aware.dtype)
        tm.assert_index_equal(built, aware)

        # Categorical wrapping intervals, dtype passed by keyword
        intervals = IntervalIndex.from_breaks(range(5))
        interval_cat = Categorical(intervals)
        built = Index(interval_cat, dtype=intervals.dtype)
        tm.assert_index_equal(built, intervals)

    def test_constructor_ea_values_mismatched_categorical_dtype(self):
        # tz-naive datetime data with dtype="category"
        naive = date_range("2016-01-01", periods=3)
        built = Index(naive, dtype="category")
        wanted = CategoricalIndex(naive)
        tm.assert_index_equal(built, wanted)

        # tz-aware datetime data with dtype="category"
        aware = date_range("2016-01-01", periods=3, tz="US/Pacific")
        built = Index(aware, dtype="category")
        wanted = CategoricalIndex(aware)
        tm.assert_index_equal(built, wanted)

    def test_constructor_period_values_mismatched_dtype(self):
        periods = period_range("2016-01-01", periods=3, freq="D")
        built = Index(periods, dtype="category")
        wanted = CategoricalIndex(periods)
        tm.assert_index_equal(built, wanted)

    def test_constructor_timedelta64_values_mismatched_dtype(self):
        deltas = timedelta_range("4 Days", periods=5)
        built = Index(deltas, dtype="category")
        wanted = CategoricalIndex(deltas)
        tm.assert_index_equal(built, wanted)

    def test_constructor_interval_values_mismatched_dtype(self):
        breaks = date_range("2016-01-01", periods=3)
        intervals = IntervalIndex.from_breaks(breaks)
        built = Index(intervals, dtype="category")
        wanted = CategoricalIndex(intervals)
        tm.assert_index_equal(built, wanted)

    def test_constructor_datetime64_values_mismatched_period_dtype(self):
        stamps = date_range("2016-01-01", periods=3)
        built = Index(stamps, dtype="Period[D]")
        wanted = stamps.to_period("D")
        tm.assert_index_equal(built, wanted)


class TestIndexConstructorUnwrapping:
# Test passing different arraylike values to pd.Index

Expand Down