Skip to content

Commit

Permalink
ENH: Add Index.fillna
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed Nov 1, 2015
1 parent f44a83a commit bcdb3e9
Show file tree
Hide file tree
Showing 10 changed files with 412 additions and 63 deletions.
25 changes: 25 additions & 0 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1367,6 +1367,31 @@ with duplicates dropped.
idx1.sym_diff(idx2)
idx1 ^ idx2
.. _indexing.missing:

Missing values
~~~~~~~~~~~~~~

.. versionadded:: 0.17.1

.. important::

Even though ``Index`` can hold missing values (``NaN``), this should be avoided
if you want to prevent unexpected results. For example, some operations
exclude missing values implicitly.

``Index.fillna`` fills missing values with the specified scalar value.

.. ipython:: python
idx1 = pd.Index([1, np.nan, 3, 4])
idx1
idx1.fillna(2)
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')])
idx2
idx2.fillna(pd.Timestamp('2011-01-02'))
Set / Reset Index
-----------------

Expand Down
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ Enhancements
- ``DataFrame`` now uses the fields of a ``namedtuple`` as columns, if columns are not supplied (:issue:`11181`)
- Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)

- ``Index`` now has a ``fillna`` method (:issue:`10089`)

.. ipython:: python

pd.Index([1, np.nan, 3]).fillna(2)

.. _whatsnew_0171.api:

API changes
Expand Down
125 changes: 98 additions & 27 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from pandas.compat import range, zip, lrange, lzip, u, map
from pandas import compat
from pandas.core import algorithms
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate
import pandas.core.base as base
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate, deprecate_kwarg)
import pandas.core.common as com
Expand All @@ -29,8 +30,6 @@
from pandas.io.common import PerformanceWarning




# simplify
default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x,
escape_chars=('\t', '\r', '\n'),
Expand All @@ -45,6 +44,7 @@

_index_doc_kwargs = dict(klass='Index', inplace='',
duplicated='np.array')
_index_shared_docs = dict()


def _try_get_item(x):
Expand Down Expand Up @@ -108,6 +108,7 @@ class Index(IndexOpsMixin, PandasObject):
_allow_datetime_index_ops = False
_allow_period_index_ops = False
_is_numeric_dtype = False
_can_hold_na = True

_engine_type = _index.ObjectEngine

Expand Down Expand Up @@ -1236,6 +1237,43 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
taken = self.values.take(indices)
return self._shallow_copy(taken)

@cache_readonly
def _isnan(self):
    """Return a boolean mask flagging which elements are missing."""
    if not self._can_hold_na:
        # Callers are expected to check ``hasnans`` first, so this branch
        # should not normally be reached; answer with an all-False mask.
        return np.zeros(len(self), dtype=np.bool_)
    return isnull(self)

@cache_readonly
def _nan_idxs(self):
    """Return the integer positions of the missing elements."""
    if not self._can_hold_na:
        # An index type that cannot hold NA has no missing positions.
        return np.array([], dtype=np.int64)
    (positions,) = self._isnan.nonzero()
    return positions

@cache_readonly
def hasnans(self):
    """Return whether any element is missing; enables various perf speedups."""
    # Index types that cannot hold NA short-circuit without computing a mask.
    return self._isnan.any() if self._can_hold_na else False

def _convert_for_op(self, value):
""" Convert value to be insertable to ndarray """
return value

def _assert_can_do_op(self, value):
    """
    Validate that ``value`` is usable in a scalar operation.

    Raises
    ------
    TypeError
        If ``value`` is not a scalar according to ``lib.isscalar``.
    """
    if lib.isscalar(value):
        return
    raise TypeError(
        "'value' must be a scalar, passed: {0}".format(type(value).__name__)
    )

def putmask(self, mask, value):
"""
return a new Index of the values set with the mask
Expand All @@ -1245,8 +1283,12 @@ def putmask(self, mask, value):
numpy.ndarray.putmask
"""
values = self.values.copy()
np.putmask(values, mask, value)
return self._shallow_copy(values)
try:
np.putmask(values, mask, self._convert_for_op(value))
return self._shallow_copy(values)
except (ValueError, TypeError):
# coerces to object
return self.astype(object).putmask(mask, value)

def format(self, name=False, formatter=None, **kwargs):
"""
Expand Down Expand Up @@ -2766,15 +2808,45 @@ def drop(self, labels, errors='raise'):
return self.delete(indexer)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
@Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)
def drop_duplicates(self, keep='first'):
return super(Index, self).drop_duplicates(keep=keep)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
return super(Index, self).duplicated(keep=keep)

_index_shared_docs['fillna'] = """
    Fill NA/NaN values with the specified value

    Parameters
    ----------
    value : scalar
        Scalar value to use to fill holes (e.g. 0).
        This value cannot be list-like.
    downcast : dict, default is None
        a dict of item->dtype of what to downcast if possible,
        or the string 'infer' which will try to downcast to an appropriate
        equal type (e.g. float64 to int64 if possible)

    Returns
    -------
    filled : Index
    """

@Appender(_index_shared_docs['fillna'])
def fillna(self, value=None, downcast=None):
    # Only scalar fill values are accepted; anything else raises TypeError.
    self._assert_can_do_op(value)

    if not self.hasnans:
        # Nothing to fill: return a shallow copy of this index.
        return self._shallow_copy()

    if downcast is not None:
        # Previously this case fell through and returned an *unfilled*
        # shallow copy, silently dropping the fill. Fail loudly instead.
        msg = ("{0}.fillna does not support 'downcast' "
               "argument values other than 'None'.")
        raise NotImplementedError(msg.format(type(self).__name__))

    result = self.putmask(self._isnan, value)
    # No need to care about metadata other than name, because an index
    # holding missing values cannot have a freq.
    return Index(result, name=self.name)

def _evaluate_with_timedelta_like(self, other, op, opstr):
raise TypeError("can only perform ops with timedelta like values")

Expand Down Expand Up @@ -3200,6 +3272,16 @@ def __array__(self, dtype=None):
""" the array interface, return my values """
return np.array(self._data, dtype=dtype)

@cache_readonly
def _isnan(self):
    """Return a boolean mask flagging which elements are missing."""
    # Elements whose code equals -1 are the ones reported as missing.
    codes = self._data.codes
    return codes == -1

@Appender(_index_shared_docs['fillna'])
def fillna(self, value, downcast=None):
    # Reject non-scalar fill values up front.
    self._assert_can_do_op(value)
    # Delegate the fill to the underlying data, then rewrap it keeping
    # only the name.
    filled = self._data.fillna(value)
    return CategoricalIndex(filled, name=self.name)

def argsort(self, *args, **kwargs):
    """Return the result of ``argsort`` on ``self.values``, forwarding all arguments."""
    values = self.values
    return values.argsort(*args, **kwargs)

Expand All @@ -3214,7 +3296,7 @@ def is_unique(self):
return not self.duplicated().any()

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
from pandas.hashtable import duplicated_int64
return duplicated_int64(self.codes.astype('i8'), keep)
Expand Down Expand Up @@ -3612,6 +3694,8 @@ class Int64Index(NumericIndex):
_inner_indexer = _algos.inner_join_indexer_int64
_outer_indexer = _algos.outer_join_indexer_int64

_can_hold_na = False

_engine_type = _index.Int64Engine

def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
Expand Down Expand Up @@ -3646,11 +3730,6 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, *
def inferred_type(self):
return 'integer'

@cache_readonly
def hasnans(self):
# by definition
return False

@property
def asi8(self):
# do not cache or you'll create a memory leak
Expand Down Expand Up @@ -3872,19 +3951,6 @@ def is_all_dates(self):
"""
return False

@cache_readonly
def _nan_idxs(self):
w, = self._isnan.nonzero()
return w

@cache_readonly
def _isnan(self):
return np.isnan(self.values)

@cache_readonly
def hasnans(self):
return self._isnan.any()

@cache_readonly
def is_unique(self):
return super(Float64Index, self).is_unique and self._nan_idxs.size < 2
Expand Down Expand Up @@ -4409,7 +4475,7 @@ def is_unique(self):
return not self.duplicated().any()

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
from pandas.core.groupby import get_group_index
from pandas.hashtable import duplicated_int64
Expand All @@ -4419,6 +4485,11 @@ def duplicated(self, keep='first'):

return duplicated_int64(ids, keep)

@Appender(_index_shared_docs['fillna'])
def fillna(self, value=None, downcast=None):
    # Filling relies on isnull, which is not implemented for MultiIndex,
    # so this operation is unsupported here.
    msg = 'isnull is not defined for MultiIndex'
    raise NotImplementedError(msg)

def get_value(self, series, key):
# somewhat broken encapsulation
from pandas.core.indexing import maybe_droplevels
Expand Down
10 changes: 5 additions & 5 deletions pandas/src/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ def extract_ordinals(ndarray[object] values, freq):
p = values[i]
ordinals[i] = p.ordinal
if p.freqstr != freqstr:
raise ValueError("%s is wrong freq" % p)
raise ValueError(_DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr))

return ordinals

Expand Down Expand Up @@ -624,8 +624,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps,
return result


_DIFFERENT_FREQ_ERROR = "Input has different freq={1} from Period(freq={0})"

_DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})"
_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})"

cdef class Period(object):
"""
Expand Down Expand Up @@ -766,7 +766,7 @@ cdef class Period(object):
if isinstance(other, Period):
from pandas.tseries.frequencies import get_freq_code as _gfc
if other.freq != self.freq:
msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
raise ValueError(msg)
if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT:
return _nat_scalar_rules[op]
Expand Down Expand Up @@ -807,7 +807,7 @@ cdef class Period(object):
else:
ordinal = self.ordinal + other.n
return Period(ordinal=ordinal, freq=self.freq)
msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
raise ValueError(msg)
else: # pragma no cover
return NotImplemented
Expand Down
Loading

0 comments on commit bcdb3e9

Please sign in to comment.