pandas-dev · TomAugspurger · Feb 2, 2018 · Jan 15, 2018 · Jan 18, 2018 · Jan 18, 2018
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -1 +1,2 @@
+from .base import ExtensionArray  # noqa
 from .categorical import Categorical  # noqa
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -0,0 +1,191 @@
+"""An interface for extending pandas with custom arrays."""
+import abc
+
+import numpy as np
+
+from pandas.compat import add_metaclass
+
+
+_not_implemented_message = "{} does not implement {}."
+
+
+@add_metaclass(abc.ABCMeta)
+class ExtensionArray(object):
+    """Abstract base class for custom array types
+
+    Notes
+    -----
+    pandas will recognize instances of this class as proper arrays
+    with a custom type and will not attempt to coerce them to objects.
+
+    **Restrictions on your class constructor**
+
+        * Extension arrays should be able to be constructed with instances of
+          the class, i.e. ``ExtensionArray(extension_array)`` should return
+          an instance, not error.
+    """
+    # ------------------------------------------------------------------------
+    # Must be a Sequence
+    # ------------------------------------------------------------------------
+    @abc.abstractmethod
+    def __getitem__(self, item):
+        # type (Any) -> Any
+        """Select a subset of self.
+
+        Notes
+        -----
+        ``item`` may be one of
+
+            * A scalar integer position
+            * A slice object, where 'start', 'stop', and 'step' are
+              integers or None
+            * A 1-d boolean NumPy ndarray the same length as 'self'
+
+        For scalar ``item``, return a scalar value suitable for the array's
+        type. This should be an instance of ``self.dtype.type``.
+
+        For slice ``key``, return an instance of ``ExtensionArray``, even
+        if the slice is length 0 or 1.
+
+        For a boolean mask, return an instance of ``ExtensionArray``, filtered
+        to the values where ``item`` is True.
+        """
+
+    def __setitem__(self, key, value):
+        # type: (Any, Any) -> None
+        raise NotImplementedError(_not_implemented_message.format(
+            type(self), '__setitem__')
+        )
+
+    @abc.abstractmethod
+    def __len__(self):
+        # type: () -> int
+        pass
+
+    # ------------------------------------------------------------------------
+    # Required attributes
+    # ------------------------------------------------------------------------
+    @property
+    @abc.abstractmethod
+    def dtype(self):
+        # type: () -> ExtensionDtype
+        """An instance of 'ExtensionDtype'."""
+
+    @property
+    def shape(self):
+        # type: () -> Tuple[int, ...]
+        return (len(self),)
+
+    @property
+    def ndim(self):
+        # type: () -> int
+        """Extension Arrays are only allowed to be 1-dimensional."""
+        return 1
+
+    @property
+    @abc.abstractmethod
+    def nbytes(self):
+        # type: () -> int
+        """The number of bytes needed to store this object in memory.
+
+        If this is expensive to compute, return an approximate lower bound
+        on the number of bytes needed.
+        """
+
+    # ------------------------------------------------------------------------
+    # Additional Methods
+    # ------------------------------------------------------------------------
+    @abc.abstractmethod
+    def isna(self):
+        # type: () -> np.ndarray
+        """Boolean NumPy array indicating if each value is missing."""
+
+    # ------------------------------------------------------------------------
+    # Indexing methods
+    # ------------------------------------------------------------------------
+    @abc.abstractmethod
+    def take(self, indexer, allow_fill=True, fill_value=None):
+        # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
+        """Take elements from an array.
+
+        Parameters
+        ----------
+        indexer : sequence of integers
+            indices to be taken. -1 is used to indicate values
+            that are missing.
+        allow_fill : bool, default True
+            If False, indexer is assumed to contain no -1 values so no filling
+            will be done. This short-circuits computation of a mask. Result is
+            undefined if allow_fill == False and -1 is present in indexer.
+        fill_value : any, default None
+            Fill value to replace -1 values with. By default, this uses
+            the missing value sentinel for this type, ``self._fill_value``.
+
+        Notes
+        -----
+        This should follow pandas' semantics where -1 indicates missing values.
+        Positions where indexer is ``-1`` should be filled with the missing
+        value for this type.
+
+        This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
+        indexer is a sequence of values.
+
+        Examples
+        --------
+        Suppose the extension array somehow backed by a NumPy structured array
+        and that the underlying structured array is stored as ``self.data``.
+        Then ``take`` may be written as
+
+        .. code-block:: python
+
+           def take(self, indexer, allow_fill=True, fill_value=None):
+               mask = indexer == -1
+               result = self.data.take(indexer)
+               result[mask] = self._fill_value
+               return type(self)(result)
+        """
+
+    @abc.abstractmethod
+    def copy(self, deep=False):
+        # type: (bool) -> ExtensionArray
+        """Return a copy of the array."""
+
+    # ------------------------------------------------------------------------
+    # Block-related methods
+    # ------------------------------------------------------------------------
+    @property
+    def _fill_value(self):
+        # type: () -> Any
+        """The missing value for this type, e.g. np.nan"""
+        return None
+
+    @abc.abstractmethod
+    def _formatting_values(self):
+        # type: () -> np.ndarray
+        # At the moment, this has to be an array since we use result.dtype
+        """An array of values to be printed in, e.g. the Series repr"""
+
+    @classmethod
+    @abc.abstractmethod
+    def _concat_same_type(cls, to_concat):
+        # type: (Sequence[ExtensionArray]) -> ExtensionArray
+        """Concatenate multiple array
+
+        Parameters
+        ----------
+        to_concat : sequence of this type
+
+        Returns
+        -------
+        ExtensionArray
+        """
+
+    def _can_hold_na(self):
+        # type: () -> bool
+        """Whether your array can hold missing values. True by default.
+
+        Notes
+        -----
+        Setting this to false will optimize some operations like fillna.
+        """
+        return True
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -43,6 +43,8 @@
 from pandas.util._validators import validate_bool_kwarg
 from pandas.core.config import get_option
 
+from .base import ExtensionArray
+
 
 def _cat_compare_op(op):
     def f(self, other):
@@ -148,7 +150,7 @@ def _maybe_to_categorical(array):
 """
 
 
-class Categorical(PandasObject):
+class Categorical(ExtensionArray, PandasObject):
     """
     Represents a categorical variable in classic R / S-plus fashion
 
@@ -2130,6 +2132,21 @@ def repeat(self, repeats, *args, **kwargs):
         return self._constructor(values=codes, categories=self.categories,
                                  ordered=self.ordered, fastpath=True)
 
+    # Interface things
+    # can_hold_na, concat_same_type, formatting_values
+    @property
+    def _can_hold_na(self):
+        return True
+
+    @classmethod
+    def _concat_same_type(self, to_concat):
+        from pandas.core.dtypes.concat import _concat_categorical
+
+        return _concat_categorical(to_concat)
+
+    def _formatting_values(self):
+        return self
+
 # The Series.cat accessor
 
 

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -0,0 +1,119 @@
+"""Extend pandas with custom array types"""
+import abc
+
+from pandas.compat import add_metaclass
+
+
+@add_metaclass(abc.ABCMeta)
+class ExtensionDtype(object):
+    """A custom data type for your array.
+    """
+
+    def __str__(self):
+        return self.name
+
+    @property
+    @abc.abstractmethod
+    def type(self):
+        # type: () -> type
+        """The scalar type for your array, e.g. ``int``
+
+        It's expected ``ExtensionArray[item]`` returns an instance
+        of ``ExtensionDtype.type`` for scalar ``item``.
+        """
+
+    @property
+    def kind(self):
+        # type () -> str
+        """A character code (one of 'biufcmMOSUV'), default 'O'
+
+        This should match the NumPy dtype used when your array is
+        converted to an ndarray, which is probably 'O' for object if
+        your extension type cannot be represented as a built-in NumPy
+        type.
+
+        See Also
+        --------
+        numpy.dtype.kind
+        """
+        return 'O'
+
+    @property
+    @abc.abstractmethod
+    def name(self):
+        # type: () -> str
+        """A string identifying the data type.
+
+        Will be used for display in, e.g. ``Series.dtype``
+        """
+
+    @property
+    def names(self):
+        # type: () -> Optional[List[str]]
+        """Ordered list of field names, or None if there are no fields."""
+        return None
+
+    @classmethod
+    @abc.abstractmethod
+    def construct_from_string(cls, string):
+        """Attempt to construct this type from a string.
+
+        Parameters
+        ----------
+        string : str
+
+        Returns
+        -------
+        self : instance of 'cls'
+
+        Raises
+        ------
+        TypeError
+            If a class cannot be constructed from this 'string'.
+
+        Notes
+        -----
+        The default implementation checks if 'string' matches your
+        type's name. If so, it calls your class with no arguments.
+
+        Examples
+        --------
+        If the extension dtype can be constructed without any arguments,
+        the following may be an adequate implementation.
+
+        >>> @classmethod
+        ... def construct_from_string(cls, string)
+        ...     if string == cls.name:
+        ...         return cls()
+        ...     else:
+        ...         raise TypeError("Cannot construct a '{}' from "
+        ...                         "'{}'".format(cls, string))
+        """
+
+    @classmethod
+    def is_dtype(cls, dtype):
+        """Check if we match 'dtype'
+
+        Parameters
+        ----------
+        dtype : str or dtype
+
+        Returns
+        -------
+        is_dtype : bool
+
+        Notes
+        -----
+        The default implementation is True if
+
+        1. ``cls.construct_from_string(dtype)`` is an instance
+           of ``cls``.
+        2. 'dtype' is ``cls`` or a subclass of ``cls``.
+        """
+        if isinstance(dtype, str):
+            try:
+                return isinstance(cls.construct_from_string(dtype), cls)
+            except TypeError:
+                return False
+        else:
+            return issubclass(dtype, cls)
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1685,6 +1685,38 @@ def is_extension_type(arr):
     return False
 
 
+def is_extension_array_dtype(arr_or_dtype):
+    """Check if an object is a pandas extension array type
+
+    Parameters
+    ----------
+    arr_or_dtype : object
+
+    Returns
+    -------
+    bool
+
+    Notes
+    -----
+    This checks whether an object implements the pandas extension
+    array interface. In pandas, this includes:
+
+    * Categorical
+    * PeriodArray
+    * IntervalArray
+    * SparseArray
+
+    Third-party libraries may implement arrays or types satisfying
+    this interface as well.
+    """
+    from pandas.core.arrays import ExtensionArray
+
+    # we want to unpack series, anything else?
+    if isinstance(arr_or_dtype, ABCSeries):
+        arr_or_dtype = arr_or_dtype._values
+    return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))
+
+
 def is_complex_dtype(arr_or_dtype):
     """
     Check whether the provided array or dtype is of a complex dtype.
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		from .base import ExtensionArray # noqa
		from .categorical import Categorical # noqa