Skip to content

Commit

Permalink
REF: Define extension base classes
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger committed Jan 16, 2018
1 parent 4b06ae4 commit a9e0972
Show file tree
Hide file tree
Showing 4 changed files with 295 additions and 8 deletions.
182 changes: 182 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""An interface for extending pandas with custom arrays."""
import abc
from typing import Tuple, Sequence, Optional, Any # noqa

import numpy as np

# Template for NotImplementedError messages. The first field receives the
# object's type and the second the name of the method that is not implemented
# (see ExtensionArray.__setitem__).
_not_implemented_message = "{} does not implement {}."


class ExtensionArray(metaclass=abc.ABCMeta):
    """Abstract base class for custom array types.

    pandas will recognize instances of this class as proper arrays
    with a custom type and will not attempt to coerce them to objects.

    Subclasses must implement every member decorated with
    ``abc.abstractmethod`` below. The remaining members have default
    implementations that may be overridden.
    """

    # ------------------------------------------------------------------------
    # Must be a Sequence
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def __getitem__(self, item):
        """Select from the array.

        The default ``value_counts`` indexes ``self`` with a boolean mask,
        so implementations relying on that default must accept one.
        """
        pass

    def __setitem__(self, key, value):
        """Item assignment is optional; the default raises.

        Raises
        ------
        NotImplementedError
            Always, unless overridden by the subclass.
        """
        raise NotImplementedError(
            _not_implemented_message.format(type(self), '__setitem__')
        )

    @abc.abstractmethod
    def __iter__(self):
        """Iterate over the elements of the array."""
        pass

    @abc.abstractmethod
    def __len__(self):
        """Length of the array; used by the default ``shape``."""
        pass

    # ------------------------------------------------------------------------
    # Required attributes
    # ------------------------------------------------------------------------
    @property
    @abc.abstractmethod
    def dtype(self):
        # type: () -> ExtensionDtype
        """The dtype object describing this array."""
        pass

    @property
    def shape(self):
        # type: () -> Tuple[int, ...]
        """A 1-tuple holding ``len(self)``."""
        return (len(self),)

    @property
    def ndim(self):
        # type: () -> int
        """Extension Arrays are only allowed to be 1-dimensional"""
        return 1

    @property
    @abc.abstractmethod
    def nbytes(self):
        # type: () -> int
        """The number of bytes needed to store this array."""
        # TODO: default impl?
        pass

    # ------------------------------------------------------------------------
    # Additional Methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def isna(self):
        # type: () -> Sequence[bool]
        """Boolean indicator of missing values, one entry per element.

        The default ``value_counts`` converts the result with
        ``np.asarray`` and inverts it to build a mask.
        """
        # TODO: narrow this type?
        pass

    # ------------------------------------------------------------------------
    # Indexing methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def take(self, indexer, allow_fill=True, fill_value=None):
        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
        """For slicing"""

    @abc.abstractmethod
    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """For slicing"""
        # TODO: this isn't necessary if we only allow 1D (though maybe
        # implement it).

    @abc.abstractmethod
    def copy(self, deep=False):
        # type: (bool) -> ExtensionArray
        """Return a copy of the array."""

    # ------------------------------------------------------------------------
    # Block-related methods
    # ------------------------------------------------------------------------
    @property
    def _fill_value(self):
        # type: () -> Any
        """The missing value for this type, e.g. np.nan

        The default is None.
        """
        return None

    @abc.abstractmethod
    def _formatting_values(self):
        # type: () -> np.ndarray
        """An array of values to be printed in, e.g. the Series repr"""
        # At the moment, this has to be an array since we use result.dtype

    @classmethod
    @abc.abstractmethod
    def _concat_same_type(cls, to_concat):
        # type: (Sequence[ExtensionArray]) -> ExtensionArray
        """Concatenate multiple arrays.

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        ExtensionArray
        """

    @abc.abstractmethod
    def get_values(self):
        # type: () -> np.ndarray
        """Get the underlying values backing your data."""
        pass

    def _can_hold_na(self):
        # type: () -> bool
        """Whether your array can hold missing values. True by default.

        Notes
        -----
        Setting this to false will optimize some operations like fillna.
        """
        # NOTE(review): this is a plain method here, while Categorical
        # overrides it as a property; callers must pick one access style.
        return True

    @property
    def is_sparse(self):
        # type: () -> bool
        """Whether your array is sparse. False by default."""
        return False

    @abc.abstractmethod
    def _slice(self, slicer):
        # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray'
        """Return a new array sliced by `slicer`.

        Parameters
        ----------
        slicer : slice or np.ndarray
            If an array, it should just be a boolean mask

        Returns
        -------
        array : ExtensionArray
            Should return an ExtensionArray, even if ``self[slicer]``
            would return a scalar.
        """
        # NOTE(review): ``Union`` in the type comment above is not among the
        # names imported from ``typing`` at the top of this module.
        # XXX: We could get rid of this *if* we require that
        # ExtensionArray(extension_array[x]) always work.
        # That seems fine for when extension_array[x] is an ExtensionArray
        # but what if extension_array[x] reduces dimensionality?

    def value_counts(self, dropna=True):
        """Optional method for computing the histogram of the counts.

        Parameters
        ----------
        dropna : bool, default True
            whether to exclude missing values from the computation

        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts

        values = self
        if dropna:
            # Only discard missing entries when asked to; masking
            # unconditionally would make ``dropna=False`` a no-op.
            # XXX: this imposes boolean indexing
            values = self[~np.asarray(self.isna())]
        return value_counts(np.asarray(values), dropna=dropna)
18 changes: 17 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.core.config import get_option

from .base import ExtensionArray


def _cat_compare_op(op):
def f(self, other):
Expand Down Expand Up @@ -149,7 +151,7 @@ def _maybe_to_categorical(array):
"""


class Categorical(PandasObject):
class Categorical(ExtensionArray, PandasObject):
"""
Represents a categorical variable in classic R / S-plus fashion
Expand Down Expand Up @@ -2131,6 +2133,20 @@ def repeat(self, repeats, *args, **kwargs):
return self._constructor(values=codes, categories=self.categories,
ordered=self.ordered, fastpath=True)

# Interface things
# can_hold_na, concat_same_type, formatting_values
@property
def _can_hold_na(self):
return True

@classmethod
def _concat_same_type(self, to_concat):
from pandas.types.concat import union_categoricals
return union_categoricals(to_concat)

def _formatting_values(self):
return self

# The Series.cat accessor


Expand Down
89 changes: 89 additions & 0 deletions pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Extend pandas with custom array types"""
import abc


class ExtensionDtype(metaclass=abc.ABCMeta):
    """A custom data type for your array.

    Subclasses must provide ``name``. Every other member has a default
    implementation that may be overridden.
    """

    @property
    def type(self):
        """Typically a metaclass inheriting from 'type' with no methods.

        The default builds a new, empty class named after ``self.name``
        on every access.
        """
        return type(self.name, (), {})

    @property
    def kind(self):
        """A character code (one of 'biufcmMOSUV'), default 'O'

        See Also
        --------
        numpy.dtype.kind
        """
        return 'O'

    @property
    @abc.abstractmethod
    def name(self):
        """A string identifying the data type.

        Will be used in, e.g. ``Series.dtype``
        """

    @property
    def names(self):
        """Ordered list of field names, or None if there are no fields"""
        return None

    @classmethod
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If 'string' does not match this type's name.

        Notes
        -----
        The default implementation checks if 'string' matches your
        type's name. If so, it calls your class with no arguments.

        The comparison uses ``cls.name``, looked up on the class itself,
        so the default only matches when ``name`` is available at class
        level (e.g. defined as a plain string class attribute).
        """
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct a '{}' from "
                        "'{}'".format(cls, string))

    @classmethod
    def is_dtype(cls, dtype):
        """Check if we match 'dtype'

        Parameters
        ----------
        dtype : str or dtype

        Returns
        -------
        is_dtype : bool

        Notes
        -----
        The default implementation is True if

        1. 'dtype' is a string that returns true for
           ``cls.construct_from_string``
        2. 'dtype' is ``cls`` or a subclass of ``cls``.
        3. 'dtype' is an instance of ``cls``.

        Anything else yields False instead of raising.
        """
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        if isinstance(dtype, type):
            return issubclass(dtype, cls)
        # ``issubclass`` raises TypeError for non-class arguments, so
        # instances and arbitrary objects go through ``isinstance``.
        return isinstance(dtype, cls)
14 changes: 7 additions & 7 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
from pandas import compat
from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex

from .base import ExtensionDtype

class ExtensionDtype(object):

class PandasExtensionDtype(ExtensionDtype):
"""
A np.dtype duck-typed class, suitable for holding a custom dtype.
THIS IS NOT A REAL NUMPY DTYPE
"""
name = None
names = None
type = None
subdtype = None
kind = None
Expand Down Expand Up @@ -108,7 +108,7 @@ class CategoricalDtypeType(type):
pass


class CategoricalDtype(ExtensionDtype):
class CategoricalDtype(PandasExtensionDtype):
"""
Type for categorical data with the categories and orderedness
Expand Down Expand Up @@ -387,7 +387,7 @@ class DatetimeTZDtypeType(type):
pass


class DatetimeTZDtype(ExtensionDtype):
class DatetimeTZDtype(PandasExtensionDtype):

"""
A np.dtype duck-typed class, suitable for holding a custom datetime with tz
Expand Down Expand Up @@ -501,7 +501,7 @@ class PeriodDtypeType(type):
pass


class PeriodDtype(ExtensionDtype):
class PeriodDtype(PandasExtensionDtype):
__metaclass__ = PeriodDtypeType
"""
A Period duck-typed class, suitable for holding a period with freq dtype.
Expand Down Expand Up @@ -619,7 +619,7 @@ class IntervalDtypeType(type):
pass


class IntervalDtype(ExtensionDtype):
class IntervalDtype(PandasExtensionDtype):
__metaclass__ = IntervalDtypeType
"""
An Interval duck-typed class, suitable for holding an interval
Expand Down

0 comments on commit a9e0972

Please sign in to comment.