ENH: Add Index.fillna

pandas-dev · Nov 1, 2015 · bcdb3e9 · bcdb3e9
1 parent f44a83a
commit bcdb3e9
Show file tree

Hide file tree

Showing 10 changed files with 412 additions and 63 deletions.
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -1367,6 +1367,31 @@ with duplicates dropped.
    idx1.sym_diff(idx2)
    idx1 ^ idx2
 
+Missing values
+~~~~~~~~~~~~~~
+
+.. _indexing.missing:
+
+.. versionadded:: 0.17.1
+
+.. important::
+
+   Even though ``Index`` can hold missing values (``NaN``), this should be avoided
+   if you want to prevent unexpected results. For example, some operations
+   exclude missing values implicitly.
+
+``Index.fillna`` fills missing values with a specified scalar value.
+
+.. ipython:: python
+
+   idx1 = pd.Index([1, np.nan, 3, 4])
+   idx1
+   idx1.fillna(2)
+
+   idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')])
+   idx2
+   idx2.fillna(pd.Timestamp('2011-01-02'))
+
 Set / Reset Index
 -----------------
 

diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
@@ -26,6 +26,12 @@ Enhancements
 - ``DataFrame`` now uses the fields of a ``namedtuple`` as columns, if columns are not supplied (:issue:`11181`)
 - Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)
 
+- ``Index`` now has a ``fillna`` method (:issue:`10089`)
+
+.. ipython:: python
+
+   pd.Index([1, np.nan, 3]).fillna(2)
+
 .. _whatsnew_0171.api:
 
 API changes

diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -15,7 +15,8 @@
 from pandas.compat import range, zip, lrange, lzip, u, map
 from pandas import compat
 from pandas.core import algorithms
-from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
+from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate
+import pandas.core.base as base
 from pandas.util.decorators import (Appender, Substitution, cache_readonly,
                                     deprecate, deprecate_kwarg)
 import pandas.core.common as com
@@ -29,8 +30,6 @@
 from pandas.io.common import PerformanceWarning
 
 
-
-
 # simplify
 default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x,
                                                                 escape_chars=('\t', '\r', '\n'),
@@ -45,6 +44,7 @@
 
 _index_doc_kwargs = dict(klass='Index', inplace='',
                          duplicated='np.array')
+_index_shared_docs = dict()
 
 
 def _try_get_item(x):
@@ -108,6 +108,7 @@ class Index(IndexOpsMixin, PandasObject):
     _allow_datetime_index_ops = False
     _allow_period_index_ops = False
     _is_numeric_dtype = False
+    _can_hold_na = True
 
     _engine_type = _index.ObjectEngine
 
@@ -1236,6 +1237,43 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
         taken = self.values.take(indices)
         return self._shallow_copy(taken)
 
+    @cache_readonly
+    def _isnan(self):
+        """ return a boolean array marking which values are nan """
+        if not self._can_hold_na:
+            # not expected to be reached when callers check hasnans
+            # beforehand; the mask is all-False for an index that
+            # cannot hold NA
+            return np.zeros(len(self), dtype=np.bool_)
+
+        return isnull(self)
+
+    @cache_readonly
+    def _nan_idxs(self):
+        """ return the integer positions at which _isnan is True """
+        if not self._can_hold_na:
+            return np.array([], dtype=np.int64)
+        (positions,) = self._isnan.nonzero()
+        return positions
+
+    @cache_readonly
+    def hasnans(self):
+        """ return if I have any nans; enables various perf speedups """
+        if not self._can_hold_na:
+            # skip computing the mask altogether
+            return False
+        return self._isnan.any()
+
+    def _convert_for_op(self, value):
+        """ Convert value so it can be inserted into an ndarray; returned as-is here """
+        return value
+
+    def _assert_can_do_op(self, value):
+        """ Raise TypeError unless value is a scalar """
+        if not lib.isscalar(value):
+            passed = type(value).__name__
+            raise TypeError("'value' must be a scalar, passed: {0}".format(passed))
+
     def putmask(self, mask, value):
         """
         return a new Index of the values set with the mask
@@ -1245,8 +1283,12 @@ def putmask(self, mask, value):
         numpy.ndarray.putmask
         """
         values = self.values.copy()
-        np.putmask(values, mask, value)
-        return self._shallow_copy(values)
+        try:
+            np.putmask(values, mask, self._convert_for_op(value))
+            return self._shallow_copy(values)
+        except (ValueError, TypeError):
+            # coerces to object
+            return self.astype(object).putmask(mask, value)
 
     def format(self, name=False, formatter=None, **kwargs):
         """
@@ -2766,15 +2808,45 @@ def drop(self, labels, errors='raise'):
         return self.delete(indexer)
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
-    @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
+    @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)
     def drop_duplicates(self, keep='first'):
         return super(Index, self).drop_duplicates(keep=keep)
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
-    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, keep='first'):
         return super(Index, self).duplicated(keep=keep)
 
+    _index_shared_docs['fillna'] = """
+        Fill NA/NaN values with the specified value
+
+        Parameters
+        ----------
+        value : scalar
+            Scalar value to use to fill holes (e.g. 0).
+            This value cannot be list-like.
+        downcast : dict, default is None
+            a dict of item->dtype of what to downcast if possible,
+            or the string 'infer' which will try to downcast to an appropriate
+            equal type (e.g. float64 to int64 if possible)
+
+        Returns
+        -------
+        filled : Index
+        """
+
+    @Appender(_index_shared_docs['fillna'])
+    def fillna(self, value=None, downcast=None):
+        self._assert_can_do_op(value)
+        if not self.hasnans:
+            # nothing to fill; hand back a shallow copy of self
+            return self._shallow_copy()
+        # 'downcast' is accepted but not applied; the filled values are
+        # returned either way instead of being silently discarded.
+        # No metadata other than the name needs to be carried over.
+        result = self.putmask(self._isnan, value)
+        return Index(result, name=self.name)
+
     def _evaluate_with_timedelta_like(self, other, op, opstr):
         raise TypeError("can only perform ops with timedelta like values")
 
@@ -3200,6 +3272,16 @@ def __array__(self, dtype=None):
         """ the array interface, return my values """
         return np.array(self._data, dtype=dtype)
 
+    @cache_readonly
+    def _isnan(self):
+        """ return if each value is nan (positions whose category code is -1) """
+        return self._data.codes == -1
+
+    @Appender(_index_shared_docs['fillna'])
+    def fillna(self, value, downcast=None):
+        # validate the fill value first; 'downcast' is not used in this body
+        self._assert_can_do_op(value)
+        # the fill itself is delegated to the underlying data's fillna
+        return CategoricalIndex(self._data.fillna(value), name=self.name)
+
     def argsort(self, *args, **kwargs):
         return self.values.argsort(*args, **kwargs)
 
@@ -3214,7 +3296,7 @@ def is_unique(self):
         return not self.duplicated().any()
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
-    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, keep='first'):
         from pandas.hashtable import duplicated_int64
         return duplicated_int64(self.codes.astype('i8'), keep)
@@ -3612,6 +3694,8 @@ class Int64Index(NumericIndex):
     _inner_indexer = _algos.inner_join_indexer_int64
     _outer_indexer = _algos.outer_join_indexer_int64
 
+    _can_hold_na = False
+
     _engine_type = _index.Int64Engine
 
     def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
@@ -3646,11 +3730,6 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, *
     def inferred_type(self):
         return 'integer'
 
-    @cache_readonly
-    def hasnans(self):
-        # by definition
-        return False
-
     @property
     def asi8(self):
         # do not cache or you'll create a memory leak
@@ -3872,19 +3951,6 @@ def is_all_dates(self):
         """
         return False
 
-    @cache_readonly
-    def _nan_idxs(self):
-        w, = self._isnan.nonzero()
-        return w
-
-    @cache_readonly
-    def _isnan(self):
-        return np.isnan(self.values)
-
-    @cache_readonly
-    def hasnans(self):
-        return self._isnan.any()
-
     @cache_readonly
     def is_unique(self):
         return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
@@ -4409,7 +4475,7 @@ def is_unique(self):
         return not self.duplicated().any()
 
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
-    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, keep='first'):
         from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64
@@ -4419,6 +4485,11 @@ def duplicated(self, keep='first'):
 
         return duplicated_int64(ids, keep)
 
+    @Appender(_index_shared_docs['fillna'])
+    def fillna(self, value=None, downcast=None):
+        # always raises: isnull is not implemented for MultiIndex
+        raise NotImplementedError('isnull is not defined for MultiIndex')
+
     def get_value(self, series, key):
         # somewhat broken encapsulation
         from pandas.core.indexing import maybe_droplevels

diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx
@@ -452,7 +452,7 @@ def extract_ordinals(ndarray[object] values, freq):
         p = values[i]
         ordinals[i] = p.ordinal
         if p.freqstr != freqstr:
-            raise ValueError("%s is wrong freq" % p)
+            raise ValueError(_DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr))
 
     return ordinals
 
@@ -624,8 +624,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps,
     return result
 
 
-_DIFFERENT_FREQ_ERROR = "Input has different freq={1} from Period(freq={0})"
-
+_DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})"
+_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})"
 
 cdef class Period(object):
     """
@@ -766,7 +766,7 @@ cdef class Period(object):
         if isinstance(other, Period):
             from pandas.tseries.frequencies import get_freq_code as _gfc
             if other.freq != self.freq:
-                msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
+                msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
                 raise ValueError(msg)
             if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT:
                 return _nat_scalar_rules[op]
@@ -807,7 +807,7 @@ cdef class Period(object):
                 else:
                     ordinal = self.ordinal + other.n
                 return Period(ordinal=ordinal, freq=self.freq)
-            msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
+            msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
             raise ValueError(msg)
         else: # pragma no cover
             return NotImplemented