-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REF: Make PeriodArray an ExtensionArray #22862
Changes from 1 commit
eaadcbc
a05928a
3c0d9ee
63fc3fa
7d5d71c
e5caac6
c194407
eb4506b
1b9fd7a
0fa0ed1
3247ea8
c162cdd
611d378
fb2ff82
25a380f
1b2c4ec
d04293e
eacad39
70cd3b8
9b22889
87d289a
6369c7f
01551f0
0437940
42ab137
298390f
23e5cfc
9d17fd2
959cd72
b66f617
5669675
2c0311c
012be1c
c3a96d0
67faabc
ff7c06c
c2d57bd
fbde770
1c4bbe7
b395c90
d68a5c5
0c7b704
d26d3d2
e4babea
7f6c144
b4aa4ca
6a70131
9aa077c
411738c
8e0fb69
6d98e85
6d9e150
4899479
634def1
1f18452
2f92b22
23f232c
dd3b8cd
1a7c360
87ecb64
0bde329
2d85a82
438e6b5
a9456fd
ac05365
36ed547
4652ca7
a047a1b
a4a30d7
1441ae6
8003808
5f43753
1c13d0f
bae6b3d
f422cf0
0229d74
aa40cf4
29085e1
00ffddf
e81fa9c
0c8925f
96204a1
82930f7
8d24582
1fc7744
6cd428c
21693e0
b65ffad
1f438e3
f3928fb
089f8ab
700650a
452c229
e3e0e57
78751c2
203d561
eb1c67d
e08aa79
c1ee04b
827e563
ca4a7fd
ed185c0
b3407ac
a4011eb
fc1ca3c
1b1841f
b3b315a
3ab4176
8102475
8c329eb
78d4960
4e3d914
5e4aaa7
7f77563
f88d6f7
7aa78ba
2d737f8
833899a
236b49c
8230347
bf33a57
738acfe
032ec02
77e389a
61031d7
a094b3d
ace4856
fc6a1c7
900afcf
0baa3e9
f95106e
e57e24a
ce1c970
a7e1216
2548d6a
02e3863
1997cff
af2d1de
64f5778
4151510
ac9bd41
5462bd7
c1c6428
7ab2736
bd6f966
8068daf
5691506
575d61a
4065bdb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,15 +17,21 @@ | |
from pandas.util._decorators import cache_readonly | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_integer_dtype, is_float_dtype, is_period_dtype) | ||
is_integer_dtype, is_float_dtype, is_period_dtype, | ||
is_float, is_integer, pandas_dtype, is_scalar, | ||
is_datetime64_dtype, | ||
ensure_object | ||
) | ||
from pandas.core.dtypes.dtypes import PeriodDtype | ||
from pandas.core.dtypes.generic import ABCSeries | ||
from pandas.core.dtypes.generic import ABCSeries, ABCIndex | ||
|
||
import pandas.core.common as com | ||
|
||
from pandas.tseries import frequencies | ||
from pandas.tseries.frequencies import get_freq_code as _gfc | ||
from pandas.tseries.offsets import Tick, DateOffset | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin | ||
|
||
|
||
|
@@ -49,13 +55,16 @@ def _period_array_cmp(cls, op): | |
|
||
def wrapper(self, other): | ||
op = getattr(self._ndarray_values, opname) | ||
if isinstance(other, (ABCSeries, ABCIndex)): | ||
other = other.values | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought we now tested that this was indeed returning NotImplemented? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's in #23155 (not merged yet, but I think ready to go). We'll need to re-implement / adjust the test, since it uses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had to add an xfail for |
||
|
||
if isinstance(other, Period): | ||
if other.freq != self.freq: | ||
msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) | ||
raise IncompatibleFrequency(msg) | ||
|
||
result = op(other.ordinal) | ||
elif isinstance(other, PeriodArrayMixin): | ||
elif isinstance(other, PeriodArray): | ||
if other.freq != self.freq: | ||
msg = DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) | ||
raise IncompatibleFrequency(msg) | ||
|
@@ -70,6 +79,9 @@ def wrapper(self, other): | |
elif other is NaT: | ||
result = np.empty(len(self._ndarray_values), dtype=bool) | ||
result.fill(nat_result) | ||
elif isinstance(other, (list, np.ndarray)): | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# XXX: is this correct? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, but this is broken in the status quo, #21793 |
||
return NotImplemented | ||
else: | ||
other = Period(other, freq=self.freq) | ||
result = op(other.ordinal) | ||
|
@@ -82,7 +94,186 @@ def wrapper(self, other): | |
return compat.set_function_name(wrapper, opname, cls) | ||
|
||
|
||
class PeriodArrayMixin(DatetimeLikeArrayMixin): | ||
class PeriodArray(DatetimeLikeArrayMixin, ExtensionArray): | ||
""" | ||
Pandas ExtensionArray for Period data. | ||
|
||
There are two components to a PeriodArray | ||
|
||
- ordinals | ||
- freq | ||
|
||
The values are physically stored as an ndarray of integers. These are | ||
called "ordinals" and represent some kind of offset from a base. | ||
|
||
The `freq` indicates the span covered by each element of the array. | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
All elements in the PeriodArray have the same `freq`. | ||
""" | ||
_attributes = ["freq"] | ||
# -------------------------------------------------------------------- | ||
# Constructors | ||
|
||
def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, | ||
periods=None, tz=None, dtype=None, copy=False, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tz very likely doesnt belong There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Part of why the constructors for the existing mixins are bare-bones is because there are comments in the Index subclasses suggesting things like start/end should be taken out of them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I mentioned on the mailing list, I would rather go for a very simple constructor (basically what There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And I think we could then combine There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To maybe clarify what I mean, a small change to IntervalArray: https://github.com/pandas-dev/pandas/compare/master...jorisvandenbossche:intervalarray?expand=1 (needs better naming of course): remove There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No preference on doing it as part of this PR or another. The diff is already impossible to read, so I may as well try doing it here. |
||
**fields): | ||
from pandas import PeriodIndex, DatetimeIndex, Int64Index | ||
|
||
# copy-pase from PeriodIndex.__new__ with slight adjustments. | ||
# | ||
# - removed all uses of name | ||
valid_field_set = {'year', 'month', 'day', 'quarter', | ||
'hour', 'minute', 'second'} | ||
|
||
if not set(fields).issubset(valid_field_set): | ||
raise TypeError('__new__() got an unexpected keyword argument {}'. | ||
format(list(set(fields) - valid_field_set)[0])) | ||
|
||
if periods is not None: | ||
if is_float(periods): | ||
periods = int(periods) | ||
elif not is_integer(periods): | ||
msg = 'periods must be a number, got {periods}' | ||
raise TypeError(msg.format(periods=periods)) | ||
|
||
if dtype is not None: | ||
dtype = pandas_dtype(dtype) | ||
if not is_period_dtype(dtype): | ||
raise ValueError('dtype must be PeriodDtype') | ||
if freq is None: | ||
freq = dtype.freq | ||
elif freq != dtype.freq: | ||
msg = 'specified freq and dtype are different' | ||
raise IncompatibleFrequency(msg) | ||
|
||
# coerce freq to freq object, otherwise it can be coerced elementwise | ||
# which is slow | ||
if freq: | ||
freq = Period._maybe_convert_freq(freq) | ||
|
||
if data is None: | ||
if ordinal is not None: | ||
data = np.asarray(ordinal, dtype=np.int64) | ||
else: | ||
data, freq = cls._generate_range(start, end, periods, | ||
freq, fields) | ||
return cls._from_ordinals(data, freq=freq) | ||
|
||
if isinstance(data, (PeriodArray, PeriodIndex)): | ||
if freq is None or freq == data.freq: # no freq change | ||
freq = data.freq | ||
data = data._ndarray_values | ||
else: | ||
base1, _ = _gfc(data.freq) | ||
base2, _ = _gfc(freq) | ||
data = libperiod.period_asfreq_arr(data._ndarray_values, | ||
base1, base2, 1) | ||
return cls._simple_new(data, freq=freq) | ||
|
||
# not array / index | ||
if not isinstance(data, (np.ndarray, PeriodIndex, | ||
DatetimeIndex, Int64Index)): | ||
if is_scalar(data) or isinstance(data, Period): | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# XXX | ||
cls._scalar_data_error(data) | ||
|
||
# other iterable of some kind | ||
if not isinstance(data, (list, tuple)): | ||
data = list(data) | ||
|
||
data = np.asarray(data) | ||
|
||
# datetime other than period | ||
if is_datetime64_dtype(data.dtype): | ||
data = dt64arr_to_periodarr(data, freq, tz) | ||
return cls._from_ordinals(data, freq=freq) | ||
|
||
# check not floats | ||
if lib.infer_dtype(data) == 'floating' and len(data) > 0: | ||
raise TypeError("PeriodIndex does not allow " | ||
"floating point in construction") | ||
|
||
# anything else, likely an array of strings or periods | ||
data = ensure_object(data) | ||
freq = freq or libperiod.extract_freq(data) | ||
data = libperiod.extract_ordinals(data, freq) | ||
return cls._from_ordinals(data, freq=freq) | ||
|
||
@property | ||
def asi8(self): | ||
return self._data.view("i8") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is also duplicated a bit below |
||
|
||
@classmethod | ||
def _from_sequence(cls, scalars, dtype=None, copy=False): | ||
return cls(scalars, dtype=dtype, copy=copy) | ||
|
||
@classmethod | ||
def _from_factorized(cls, values, original): | ||
return cls(values, dtype=original.dtype) | ||
|
||
def __repr__(self): | ||
return '<pandas PeriodArray>\n{}\nLength: {}, dtype: {}'.format( | ||
[str(s) for s in self], | ||
len(self), | ||
self.dtype | ||
) | ||
|
||
def __len__(self): | ||
return len(self._data) | ||
|
||
def isna(self): | ||
return self._data == iNaT | ||
|
||
def take(self, indices, allow_fill=False, fill_value=None): | ||
from pandas.core.algorithms import take | ||
|
||
if fill_value is None: | ||
fill_value = iNaT | ||
elif isinstance(fill_value, Period): | ||
fill_value = fill_value.ordinal | ||
elif fill_value is NaT: | ||
fill_value = iNaT | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. possibly self._fill_value or self._na_value or something? Those attributes exist in a few places but I don't think get used often. |
||
elif fill_value != self.dtype.na_value: | ||
raise ValueError("Expected a Period.") | ||
|
||
new_values = take(self._data, | ||
indices, | ||
allow_fill=allow_fill, | ||
fill_value=fill_value) | ||
|
||
return self._from_ordinals(new_values, self.freq) | ||
|
||
@property | ||
def nbytes(self): | ||
return self._data.nbytes | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def copy(self, deep=False): | ||
return self._from_ordinals(self._data.copy(), freq=self.freq) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, thanks. |
||
|
||
@classmethod | ||
def _concat_same_type(cls, to_concat): | ||
freq = {x.freq for x in to_concat} | ||
assert len(freq) == 1 | ||
freq = list(freq)[0] | ||
values = np.concatenate([x._data for x in to_concat]) | ||
return cls._from_ordinals(values, freq=freq) | ||
|
||
def value_counts(self, dropna=False): | ||
from pandas.core.algorithms import value_counts | ||
from pandas.core.indexes.period import PeriodIndex | ||
|
||
if dropna: | ||
values = self[~self.isna()]._data | ||
else: | ||
values = self._data | ||
|
||
result = value_counts(values) | ||
index = PeriodIndex._from_ordinals(result.index, | ||
name=result.index.name, | ||
freq=self.freq) | ||
return type(result)(result.values, | ||
index=index, | ||
name=result.name) | ||
|
||
@property | ||
def _box_func(self): | ||
return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) | ||
|
@@ -114,21 +305,6 @@ def freq(self, value): | |
FutureWarning, stacklevel=2) | ||
self._freq = value | ||
|
||
# -------------------------------------------------------------------- | ||
# Constructors | ||
|
||
_attributes = ["freq"] | ||
|
||
def __new__(cls, values, freq=None, **kwargs): | ||
if is_period_dtype(values): | ||
# PeriodArray, PeriodIndex | ||
if freq is not None and values.freq != freq: | ||
raise IncompatibleFrequency(freq, values.freq) | ||
freq = values.freq | ||
values = values.asi8 | ||
|
||
return cls._simple_new(values, freq, **kwargs) | ||
|
||
@classmethod | ||
def _simple_new(cls, values, freq=None, **kwargs): | ||
""" | ||
|
@@ -264,7 +440,7 @@ def asfreq(self, freq=None, how='E'): | |
if self.hasnans: | ||
new_data[self._isnan] = iNaT | ||
|
||
return self._simple_new(new_data, self.name, freq=freq) | ||
return self._simple_new(new_data, freq=freq) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch, maybe better to use shallow_copy? |
||
|
||
# ------------------------------------------------------------------ | ||
# Arithmetic Methods | ||
|
@@ -319,7 +495,7 @@ def _add_delta(self, other): | |
ordinal_delta = self._maybe_convert_timedelta(other) | ||
return self.shift(ordinal_delta) | ||
|
||
def shift(self, n): | ||
def shift(self, periods=1): | ||
""" | ||
Specialized shift which produces an Period Array/Index | ||
|
||
|
@@ -332,7 +508,8 @@ def shift(self, n): | |
------- | ||
shifted : Period Array/Index | ||
""" | ||
values = self._ndarray_values + n * self.freq.n | ||
# TODO: ensure we match EA semantics, not PeriodIndex | ||
values = self._ndarray_values + periods * self.freq.n | ||
if self.hasnans: | ||
values[self._isnan] = iNaT | ||
return self._shallow_copy(values=values) | ||
|
@@ -384,9 +561,15 @@ def _maybe_convert_timedelta(self, other): | |
raise IncompatibleFrequency(msg.format(cls=type(self).__name__, | ||
freqstr=self.freqstr)) | ||
|
||
@classmethod | ||
def _scalar_data_error(cls, data): | ||
raise TypeError('{0}(...) must be called with a collection of some ' | ||
'kind, {1} was passed'.format(cls.__name__, | ||
repr(data))) | ||
|
||
|
||
PeriodArrayMixin._add_comparison_ops() | ||
PeriodArrayMixin._add_datetimelike_methods() | ||
PeriodArray._add_comparison_ops() | ||
PeriodArray._add_datetimelike_methods() | ||
|
||
|
||
# ------------------------------------------------------------------- | ||
|
@@ -486,3 +669,12 @@ def _make_field_arrays(*fields): | |
else np.repeat(x, length) for x in fields] | ||
|
||
return arrays | ||
|
||
|
||
def dt64arr_to_periodarr(data, freq, tz): | ||
if data.dtype != np.dtype('M8[ns]'): | ||
raise ValueError('Wrong dtype: %s' % data.dtype) | ||
|
||
freq = Period._maybe_convert_freq(freq) | ||
base, mult = _gfc(freq) | ||
return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you isort (if you have not done), and remov from the non-checking list
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Going to wait on that since there are other outstanding PRs touching imports.