Skip to content

Commit

Permalink
BUG: silently ignoring dtype kwarg in Index.__new__ (pandas-dev#38879)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and luckyvs1 committed Jan 20, 2021
1 parent c4e9572 commit 1203236
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 94 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ ExtensionArray

Other
^^^^^

- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
-
-

Expand Down
13 changes: 13 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1529,6 +1529,19 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None


def is_ea_or_datetimelike_dtype(dtype: Optional[DtypeObj]) -> bool:
    """
    Report whether ``dtype`` is an ExtensionDtype, datetime64 or timedelta64 dtype.

    Parameters
    ----------
    dtype : np.dtype, ExtensionDtype or None
        An already-constructed dtype object.

    Returns
    -------
    bool
        True for any ExtensionDtype instance and for numpy dtypes whose
        ``kind`` is "m" (timedelta64) or "M" (datetime64); False otherwise,
        including for None.

    Notes
    -----
    Checks only for dtype objects, not dtype-castable strings or types.
    """
    if isinstance(dtype, ExtensionDtype):
        return True
    if not isinstance(dtype, np.dtype):
        # None, strings and python types are deliberately not resolved here.
        return False
    return dtype.kind in ["m", "M"]


def is_complex_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of a complex dtype.
Expand Down
142 changes: 54 additions & 88 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
ensure_platform_int,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_any_dtype,
is_dtype_equal,
is_ea_or_datetimelike_dtype,
is_extension_array_dtype,
is_float,
is_float_dtype,
Expand All @@ -56,10 +56,8 @@
is_iterator,
is_list_like,
is_object_dtype,
is_period_dtype,
is_scalar,
is_signed_integer_dtype,
is_timedelta64_dtype,
is_unsigned_integer_dtype,
needs_i8_conversion,
pandas_dtype,
Expand All @@ -69,6 +67,7 @@
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
IntervalDtype,
PeriodDtype,
)
Expand All @@ -87,6 +86,7 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import IndexOpsMixin, PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
Expand Down Expand Up @@ -286,44 +286,32 @@ def __new__(

# range
if isinstance(data, RangeIndex):
return RangeIndex(start=data, copy=copy, dtype=dtype, name=name)
result = RangeIndex(start=data, copy=copy, name=name)
if dtype is not None:
return result.astype(dtype, copy=False)
return result
elif isinstance(data, range):
return RangeIndex.from_range(data, dtype=dtype, name=name)

# categorical
elif is_categorical_dtype(data_dtype) or is_categorical_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas.core.indexes.category import CategoricalIndex

return _maybe_asobject(dtype, CategoricalIndex, data, copy, name, **kwargs)

# interval
elif is_interval_dtype(data_dtype) or is_interval_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas.core.indexes.interval import IntervalIndex

return _maybe_asobject(dtype, IntervalIndex, data, copy, name, **kwargs)

elif is_datetime64_any_dtype(data_dtype) or is_datetime64_any_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas import DatetimeIndex

return _maybe_asobject(dtype, DatetimeIndex, data, copy, name, **kwargs)

elif is_timedelta64_dtype(data_dtype) or is_timedelta64_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas import TimedeltaIndex

return _maybe_asobject(dtype, TimedeltaIndex, data, copy, name, **kwargs)

elif is_period_dtype(data_dtype) or is_period_dtype(dtype):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
from pandas import PeriodIndex
result = RangeIndex.from_range(data, name=name)
if dtype is not None:
return result.astype(dtype, copy=False)
return result

return _maybe_asobject(dtype, PeriodIndex, data, copy, name, **kwargs)
if is_ea_or_datetimelike_dtype(dtype):
# non-EA dtype indexes have special casting logic, so we punt here
klass = cls._dtype_to_subclass(dtype)
if klass is not Index:
return klass(data, dtype=dtype, copy=copy, name=name, **kwargs)

if is_ea_or_datetimelike_dtype(data_dtype):
klass = cls._dtype_to_subclass(data_dtype)
if klass is not Index:
result = klass(data, copy=copy, name=name, **kwargs)
if dtype is not None:
return result.astype(dtype, copy=False)
return result

# extension dtype
elif is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype):
if is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype):
if not (dtype is None or is_object_dtype(dtype)):
# coerce to the provided dtype
ea_cls = dtype.construct_array_type()
Expand Down Expand Up @@ -407,26 +395,38 @@ def _ensure_array(cls, data, dtype, copy: bool):
def _dtype_to_subclass(cls, dtype: DtypeObj):
# Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423

if isinstance(dtype, DatetimeTZDtype) or dtype == np.dtype("M8[ns]"):
if isinstance(dtype, ExtensionDtype):
if isinstance(dtype, DatetimeTZDtype):
from pandas import DatetimeIndex

return DatetimeIndex
elif isinstance(dtype, CategoricalDtype):
from pandas import CategoricalIndex

return CategoricalIndex
elif isinstance(dtype, IntervalDtype):
from pandas import IntervalIndex

return IntervalIndex
elif isinstance(dtype, PeriodDtype):
from pandas import PeriodIndex

return PeriodIndex

elif isinstance(dtype, SparseDtype):
return cls._dtype_to_subclass(dtype.subtype)

return Index

if dtype.kind == "M":
from pandas import DatetimeIndex

return DatetimeIndex
elif dtype == "m8[ns]":

elif dtype.kind == "m":
from pandas import TimedeltaIndex

return TimedeltaIndex
elif isinstance(dtype, CategoricalDtype):
from pandas import CategoricalIndex

return CategoricalIndex
elif isinstance(dtype, IntervalDtype):
from pandas import IntervalIndex

return IntervalIndex
elif isinstance(dtype, PeriodDtype):
from pandas import PeriodIndex

return PeriodIndex

elif is_float_dtype(dtype):
from pandas import Float64Index
Expand All @@ -445,6 +445,9 @@ def _dtype_to_subclass(cls, dtype: DtypeObj):
# NB: assuming away MultiIndex
return Index

elif issubclass(dtype.type, (str, bool, np.bool_)):
return Index

raise NotImplementedError(dtype)

"""
Expand Down Expand Up @@ -6253,43 +6256,6 @@ def _try_convert_to_int_array(
raise ValueError


def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs):
    """
    Construct a ``klass`` index, casting to object afterwards when requested.

    When ``dtype`` is object, the index is first built without a dtype and
    then converted with ``.astype(object)``; otherwise ``dtype`` is passed
    straight through to the ``klass`` constructor.

    Parameters
    ----------
    dtype : np.dtype, ExtensionDtype, str
    klass : Index subclass
    data : list-like
    copy : bool
    name : hashable
    **kwargs
        Forwarded unchanged to the ``klass`` constructor.

    Returns
    -------
    Index

    Notes
    -----
    We assume that calling .astype(object) on this klass will make a copy.
    """
    if not is_dtype_equal(_o_dtype, dtype):
        # Common case: the subclass constructor handles the dtype itself.
        return klass(data, dtype=dtype, copy=copy, name=name, **kwargs)

    # GH#23524 passing `dtype=object` to DatetimeIndex is invalid and
    # will raise in the case where `data` is already tz-aware, so the
    # dtype is left out of construction and the cast happens afterwards.
    # copy=False is fine here because the .astype below always makes a copy.
    non_object_index = klass(data, copy=False, name=name, **kwargs)
    return non_object_index.astype(object)


def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]:
"""
Return common name if all indices agree, otherwise None (level-by-level).
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/indexes/ranges/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,6 @@ def test_constructor_range(self):
expected = RangeIndex(1, 5, 2)
tm.assert_index_equal(result, expected, exact=True)

with pytest.raises(
ValueError,
match="Incorrect `dtype` passed: expected signed integer, received float64",
):
Index(range(1, 5, 2), dtype="float64")
msg = r"^from_range\(\) got an unexpected keyword argument"
with pytest.raises(TypeError, match=msg):
RangeIndex.from_range(range(10), copy=True)
Expand Down
78 changes: 78 additions & 0 deletions pandas/tests/indexes/test_index_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,22 @@

from pandas import (
NA,
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Int64Index,
IntervalIndex,
MultiIndex,
NaT,
PeriodIndex,
Series,
TimedeltaIndex,
Timestamp,
UInt64Index,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm

Expand Down Expand Up @@ -122,6 +126,80 @@ def test_constructor_mixed_nat_objs_infers_object(self, swap_objs):
tm.assert_index_equal(Index(np.array(data, dtype=object)), expected)


class TestDtypeEnforced:
    """Check that ``Index(...)`` does not silently ignore the dtype keyword."""

    @pytest.mark.parametrize("dtype", [object, "float64", "uint64", "category"])
    def test_constructor_range_values_mismatched_dtype(self, dtype):
        """Both a range-backed Index and a bare ``range`` honor ``dtype``."""
        range_backed = Index(range(5))

        from_index = Index(range_backed, dtype=dtype)
        assert from_index.dtype == dtype

        from_range = Index(range(5), dtype=dtype)
        assert from_range.dtype == dtype

    @pytest.mark.parametrize("dtype", [object, "float64", "uint64", "category"])
    def test_constructor_categorical_values_mismatched_non_ea_dtype(self, dtype):
        """Categorical input is cast to the requested dtype."""
        values = Categorical([1, 2, 3])

        assert Index(values, dtype=dtype).dtype == dtype

    def test_constructor_categorical_values_mismatched_dtype(self):
        """Categorical input with an explicit dtype round-trips to the source index."""
        # tz-naive datetime64
        naive = date_range("2016-01-01", periods=3)
        result = Index(Categorical(naive), naive.dtype)
        tm.assert_index_equal(result, naive)

        # tz-aware datetime64
        aware = naive.tz_localize("Asia/Tokyo")
        result = Index(Categorical(aware), aware.dtype)
        tm.assert_index_equal(result, aware)

        # interval
        intervals = IntervalIndex.from_breaks(range(5))
        result = Index(Categorical(intervals), dtype=intervals.dtype)
        tm.assert_index_equal(result, intervals)

    def test_constructor_ea_values_mismatched_categorical_dtype(self):
        """Datetime input with ``dtype="category"`` yields a CategoricalIndex."""
        naive = date_range("2016-01-01", periods=3)
        tm.assert_index_equal(Index(naive, dtype="category"), CategoricalIndex(naive))

        aware = date_range("2016-01-01", periods=3, tz="US/Pacific")
        tm.assert_index_equal(Index(aware, dtype="category"), CategoricalIndex(aware))

    def test_constructor_period_values_mismatched_dtype(self):
        """Period input with ``dtype="category"`` yields a CategoricalIndex."""
        periods = period_range("2016-01-01", periods=3, freq="D")
        tm.assert_index_equal(
            Index(periods, dtype="category"), CategoricalIndex(periods)
        )

    def test_constructor_timedelta64_values_mismatched_dtype(self):
        """Timedelta input with ``dtype="category"`` yields a CategoricalIndex."""
        deltas = timedelta_range("4 Days", periods=5)
        tm.assert_index_equal(Index(deltas, dtype="category"), CategoricalIndex(deltas))

    def test_constructor_interval_values_mismatched_dtype(self):
        """Interval input with ``dtype="category"`` yields a CategoricalIndex."""
        breaks = date_range("2016-01-01", periods=3)
        intervals = IntervalIndex.from_breaks(breaks)
        tm.assert_index_equal(
            Index(intervals, dtype="category"), CategoricalIndex(intervals)
        )

    def test_constructor_datetime64_values_mismatched_period_dtype(self):
        """Datetime input with a Period dtype is converted like ``to_period``."""
        stamps = date_range("2016-01-01", periods=3)
        tm.assert_index_equal(Index(stamps, dtype="Period[D]"), stamps.to_period("D"))


class TestIndexConstructorUnwrapping:
# Test passing different arraylike values to pd.Index

Expand Down

0 comments on commit 1203236

Please sign in to comment.