-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4b06ae4
commit a9e0972
Showing
4 changed files
with
295 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
"""An interface for extending pandas with custom arrays.""" | ||
import abc | ||
from typing import Tuple, Sequence, Optional, Any # noqa | ||
|
||
import numpy as np | ||
|
||
# Template for NotImplementedError messages; filled with the array type and
# the name of the method that the type does not implement.
_not_implemented_message = "{} does not implement {}."


class ExtensionArray(metaclass=abc.ABCMeta):
    """Abstract base class for custom array types.

    pandas will recognize instances of this class as proper arrays
    with a custom type and will not attempt to coerce them to objects.

    Subclasses are expected to implement the following methods.
    Everything decorated with ``abc.abstractmethod`` must be overridden
    before a subclass can be instantiated; the remaining members have
    defaults that may be overridden.
    """
    # ------------------------------------------------------------------------
    # Must be a Sequence
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def __getitem__(self, item):
        """Select a subset of self (abstract).

        ``value_counts`` indexes ``self`` with a boolean mask, so
        implementations need to support boolean-array indexing for that
        default to work.
        """
        pass

    def __setitem__(self, key, value):
        """Item assignment is optional; the default always raises.

        Raises
        ------
        NotImplementedError
            Always, unless a subclass overrides this method.
        """
        raise NotImplementedError(_not_implemented_message.format(
            type(self), '__setitem__')
        )

    @abc.abstractmethod
    def __iter__(self):
        """Iterate over the elements of the array (abstract)."""
        pass

    @abc.abstractmethod
    def __len__(self):
        """Return the number of elements (abstract).

        The default ``shape`` is derived from this value.
        """
        pass

    # ------------------------------------------------------------------------
    # Required attributes
    # ------------------------------------------------------------------------
    @property
    @abc.abstractmethod
    def dtype(self):
        # type: () -> ExtensionDtype
        """The dtype describing this array (abstract)."""
        pass

    @property
    def shape(self):
        # type: () -> Tuple[int, ...]
        """One-element tuple holding the length of the array."""
        return (len(self),)

    @property
    def ndim(self):
        # type: () -> int
        """Extension Arrays are only allowed to be 1-dimensional"""
        return 1

    @property
    @abc.abstractmethod
    def nbytes(self):
        # type: () -> int
        """The number of bytes needed to store this object (abstract)."""
        # TODO: default impl?
        pass

    # ------------------------------------------------------------------------
    # Additional Methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def isna(self):
        # type: () -> Sequence[bool]
        """Boolean indicators of which elements are missing (abstract).

        ``value_counts`` converts the result with ``np.asarray`` and
        inverts it, so it must be convertible to a boolean ndarray of
        the same length as ``self``.
        """
        # TODO: narrow this type?
        pass

    # ------------------------------------------------------------------------
    # Indexing methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def take(self, indexer, allow_fill=True, fill_value=None):
        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
        """For slicing"""
        # NOTE(review): the exact semantics of ``allow_fill`` and
        # ``fill_value`` are not specified by this interface yet.

    @abc.abstractmethod
    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """For slicing"""
        # TODO: this isn't necessary if we only allow 1D (though maybe
        # implement it).

    @abc.abstractmethod
    def copy(self, deep=False):
        # type: (bool) -> ExtensionArray
        """Return a copy of the array."""

    # ------------------------------------------------------------------------
    # Block-related methods
    # ------------------------------------------------------------------------
    @property
    def _fill_value(self):
        # type: () -> Any
        """The missing value for this type, e.g. np.nan

        The default is ``None``.
        """
        return None

    @abc.abstractmethod
    def _formatting_values(self):
        # type: () -> np.ndarray
        """An array of values to be printed in, e.g. the Series repr"""
        # At the moment, this has to be an array since we use result.dtype

    @classmethod
    @abc.abstractmethod
    def _concat_same_type(cls, to_concat):
        # type: (Sequence[ExtensionArray]) -> ExtensionArray
        """Concatenate multiple arrays

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        ExtensionArray
        """

    @abc.abstractmethod
    def get_values(self):
        # type: () -> np.ndarray
        """Get the underlying values backing your data
        """
        pass

    def _can_hold_na(self):
        # type: () -> bool
        """Whether your array can hold missing values. True by default.

        Notes
        -----
        Setting this to false will optimize some operations like fillna.
        """
        return True

    @property
    def is_sparse(self):
        # type: () -> bool
        """Whether your array is sparse. False by default."""
        return False

    @abc.abstractmethod
    def _slice(self, slicer):
        # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray'
        """Return a new array sliced by `slicer`.

        Parameters
        ----------
        slicer : slice or np.ndarray
            If an array, it should just be a boolean mask

        Returns
        -------
        array : ExtensionArray
            Should return an ExtensionArray, even if ``self[slicer]``
            would return a scalar.
        """
        # XXX: We could get rid of this *if* we require that
        # ExtensionArray(extension_array[x]) always work.
        # That seems fine for when extension_array[x] is an ExtensionArray
        # but what if extension_array[x] reduces dimensionality?

    def value_counts(self, dropna=True):
        """Optional method for computing the histogram of the counts.

        Parameters
        ----------
        dropna : bool, default True
            whether to exclude missing values from the computation

        Returns
        -------
        counts : Series
        """
        # Imported lazily, inside the method, as in the original code.
        from pandas.core.algorithms import value_counts

        if dropna:
            # XXX: this imposes boolean indexing
            values = self[~np.asarray(self.isna())]
        else:
            # Keep missing entries so that ``dropna=False`` can count them;
            # masking unconditionally made the flag a no-op.
            values = self
        return value_counts(np.asarray(values), dropna=dropna)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
"""Extend pandas with custom array types""" | ||
import abc | ||
|
||
|
||
class ExtensionDtype(metaclass=abc.ABCMeta):
    """A custom data type for your array.

    Subclasses must provide ``name``; every other member has a default.
    """
    @property
    def type(self):
        """Typically a metaclass inheriting from 'type' with no methods.

        The default builds a new, empty class named after ``self.name``
        on every access.
        """
        return type(self.name, (), {})

    @property
    def kind(self):
        """A character code (one of 'biufcmMOSUV'), default 'O'

        See Also
        --------
        numpy.dtype.kind
        """
        return 'O'

    @property
    @abc.abstractmethod
    def name(self):
        """A string identifying the data type.

        Will be used in, e.g. ``Series.dtype``
        """

    @property
    def names(self):
        """Ordered list of field names, or None if there are no fields"""
        return None

    @classmethod
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If 'string' does not match this type's name.

        Notes
        -----
        The default implementation checks if 'string' matches your
        type's name. If so, it calls your class with no arguments.

        The comparison uses ``cls.name``, i.e. the attribute looked up on
        the class itself. It can therefore only match when ``name`` is a
        plain class attribute; if ``name`` is a property, the lookup yields
        the property object and this default always raises.
        """
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct a '{}' from "
                        "'{}'".format(cls, string))

    @classmethod
    def is_dtype(cls, dtype):
        """Check if we match 'dtype'

        Parameters
        ----------
        dtype : str or dtype

        Returns
        -------
        is_dtype : bool

        Notes
        -----
        The default implementation is True if

        1. 'dtype' is a string that returns true for
           ``cls.construct_from_string``
        2. 'dtype' is ``cls`` or a subclass of ``cls``.
        3. 'dtype' is an instance of ``cls``.

        Anything else (e.g. ``None`` or an unrelated object) is False.
        """
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        if isinstance(dtype, type):
            return issubclass(dtype, cls)
        # ``issubclass`` raises TypeError for non-class arguments, so
        # instances and arbitrary objects are checked with ``isinstance``.
        return isinstance(dtype, cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters