Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add Index.fillna #11343

Merged
merged 1 commit into from
Nov 13, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1367,6 +1367,31 @@ with duplicates dropped.
idx1.sym_diff(idx2)
idx1 ^ idx2

Missing values
~~~~~~~~~~~~~~

.. _indexing.missing:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you put this above the title?


.. versionadded:: 0.17.1

.. important::

Even though ``Index`` can hold missing values (``NaN``), this should be avoided
if you do not want unexpected results. For example, some operations
exclude missing values implicitly.

``Index.fillna`` fills missing values with the specified scalar value.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make this a link to the API docs with :meth:`Index.fillna`?


.. ipython:: python

idx1 = pd.Index([1, np.nan, 3, 4])
idx1
idx1.fillna(2)

idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')])
idx2
idx2.fillna(pd.Timestamp('2011-01-02'))

Set / Reset Index
-----------------

Expand Down
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ Enhancements
- ``DataFrame`` now uses the fields of a ``namedtuple`` as columns, if columns are not supplied (:issue:`11181`)
- Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)

- ``Index`` now has a ``fillna`` method (:issue:`10089`)

.. ipython:: python

pd.Index([1, np.nan, 3]).fillna(2)

.. _whatsnew_0171.api:

API changes
Expand Down
125 changes: 98 additions & 27 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from pandas.compat import range, zip, lrange, lzip, u, map
from pandas import compat
from pandas.core import algorithms
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate
import pandas.core.base as base
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate, deprecate_kwarg)
import pandas.core.common as com
Expand All @@ -29,8 +30,6 @@
from pandas.io.common import PerformanceWarning




# simplify
default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x,
escape_chars=('\t', '\r', '\n'),
Expand All @@ -45,6 +44,7 @@

_index_doc_kwargs = dict(klass='Index', inplace='',
duplicated='np.array')
_index_shared_docs = dict()


def _try_get_item(x):
Expand Down Expand Up @@ -108,6 +108,7 @@ class Index(IndexOpsMixin, PandasObject):
_allow_datetime_index_ops = False
_allow_period_index_ops = False
_is_numeric_dtype = False
_can_hold_na = True

_engine_type = _index.ObjectEngine

Expand Down Expand Up @@ -1236,6 +1237,43 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
taken = self.values.take(indices)
return self._shallow_copy(taken)

@cache_readonly
def _isnan(self):
    """Boolean mask, one entry per element, flagging the missing values."""
    if not self._can_hold_na:
        # Normally unreachable: callers are expected to check ``hasnans``
        # before asking for the mask.
        return np.zeros(len(self), dtype=np.bool_)
    return isnull(self)

@cache_readonly
def _nan_idxs(self):
    """Integer positions of the missing values (empty when NA cannot be held)."""
    if not self._can_hold_na:
        return np.array([], dtype=np.int64)
    # ``nonzero`` returns a 1-tuple here; unpack its only array.
    positions, = self._isnan.nonzero()
    return positions

@cache_readonly
def hasnans(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This overrides the one in core/base.py, but do we use that any longer? (I don't think Series uses this)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hadn't noticed base.py. Should we remove it from base.py, because Series can be changed in place?

""" return if I have any nans; enables various perf speedups """
if self._can_hold_na:
return self._isnan.any()
else:
return False

def _convert_for_op(self, value):
    """Return ``value`` in a form that can be inserted into the ndarray.

    This implementation hands ``value`` back unchanged.
    """
    return value

def _assert_can_do_op(self, value):
    """Validate that ``value`` is a scalar.

    Raises
    ------
    TypeError
        If ``lib.isscalar(value)`` is false; the message names the type
        that was passed.
    """
    if lib.isscalar(value):
        return
    template = "'value' must be a scalar, passed: {0}"
    raise TypeError(template.format(type(value).__name__))

def putmask(self, mask, value):
"""
return a new Index of the values set with the mask
Expand All @@ -1245,8 +1283,12 @@ def putmask(self, mask, value):
numpy.ndarray.putmask
"""
values = self.values.copy()
np.putmask(values, mask, value)
return self._shallow_copy(values)
try:
np.putmask(values, mask, self._convert_for_op(value))
return self._shallow_copy(values)
except (ValueError, TypeError):
# coerces to object
return self.astype(object).putmask(mask, value)

def format(self, name=False, formatter=None, **kwargs):
"""
Expand Down Expand Up @@ -2766,15 +2808,45 @@ def drop(self, labels, errors='raise'):
return self.delete(indexer)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
@Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs)
def drop_duplicates(self, keep='first'):
return super(Index, self).drop_duplicates(keep=keep)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
return super(Index, self).duplicated(keep=keep)

# Shared ``fillna`` docstring; each implementation attaches it via ``Appender``.
_index_shared_docs['fillna'] = """
Fill NA/NaN values with the specified value

Parameters
----------
value : scalar
    Scalar value to use to fill holes (e.g. 0).
    This value cannot be a list-like.
downcast : dict, default is None
    a dict of item->dtype of what to downcast if possible,
    or the string 'infer' which will try to downcast to an appropriate
    equal type (e.g. float64 to int64 if possible)

Returns
-------
filled : Index
"""

@Appender(_index_shared_docs['fillna'])
def fillna(self, value=None, downcast=None):
    # NOTE: no inline docstring on purpose -- the documentation is supplied
    # by the ``Appender`` decorator above.
    # Only scalars are accepted; anything else raises TypeError.
    self._assert_can_do_op(value)

    if not self.hasnans:
        # Nothing to fill: hand back a shallow copy of this index.
        return self._shallow_copy()

    if downcast is not None:
        # Previously this case fell through and silently returned an
        # *unfilled* copy. Fail loudly instead of returning wrong data.
        raise NotImplementedError(
            "{0}.fillna does not support 'downcast' argument values "
            "other than 'None'.".format(type(self).__name__))

    result = self.putmask(self._isnan, value)
    # Only ``name`` is carried over; per the original note, an index that
    # holds missing values cannot have a freq to preserve.
    return Index(result, name=self.name)

def _evaluate_with_timedelta_like(self, other, op, opstr):
    """Reject the operation: this implementation always raises ``TypeError``."""
    message = "can only perform ops with timedelta like values"
    raise TypeError(message)

Expand Down Expand Up @@ -3200,6 +3272,16 @@ def __array__(self, dtype=None):
""" the array interface, return my values """
return np.array(self._data, dtype=dtype)

@cache_readonly
def _isnan(self):
    """Boolean mask of the entries whose category code is -1."""
    codes = self._data.codes
    return codes == -1

@Appender(_index_shared_docs['fillna'])
def fillna(self, value, downcast=None):
    # Documentation comes from the ``Appender`` decorator above.
    # Validate the fill value, delegate the fill to the underlying data,
    # then rebuild the index keeping only the name.
    self._assert_can_do_op(value)
    filled = self._data.fillna(value)
    return CategoricalIndex(filled, name=self.name)

def argsort(self, *args, **kwargs):
    """Return the result of ``argsort`` on the underlying values.

    All positional and keyword arguments are forwarded unchanged to
    ``self.values.argsort``.
    """
    underlying = self.values
    return underlying.argsort(*args, **kwargs)

Expand All @@ -3214,7 +3296,7 @@ def is_unique(self):
return not self.duplicated().any()

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
from pandas.hashtable import duplicated_int64
return duplicated_int64(self.codes.astype('i8'), keep)
Expand Down Expand Up @@ -3612,6 +3694,8 @@ class Int64Index(NumericIndex):
_inner_indexer = _algos.inner_join_indexer_int64
_outer_indexer = _algos.outer_join_indexer_int64

_can_hold_na = False

_engine_type = _index.Int64Engine

def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs):
Expand Down Expand Up @@ -3646,11 +3730,6 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, *
def inferred_type(self):
return 'integer'

@cache_readonly
def hasnans(self):
# by definition
return False

@property
def asi8(self):
# do not cache or you'll create a memory leak
Expand Down Expand Up @@ -3872,19 +3951,6 @@ def is_all_dates(self):
"""
return False

@cache_readonly
def _nan_idxs(self):
w, = self._isnan.nonzero()
return w

@cache_readonly
def _isnan(self):
return np.isnan(self.values)

@cache_readonly
def hasnans(self):
return self._isnan.any()

@cache_readonly
def is_unique(self):
    """Unique per the parent class check and holding fewer than two NaNs."""
    parent_unique = super(Float64Index, self).is_unique
    return parent_unique and self._nan_idxs.size < 2
Expand Down Expand Up @@ -4409,7 +4475,7 @@ def is_unique(self):
return not self.duplicated().any()

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
from pandas.core.groupby import get_group_index
from pandas.hashtable import duplicated_int64
Expand All @@ -4419,6 +4485,11 @@ def duplicated(self, keep='first'):

return duplicated_int64(ids, keep)

@Appender(_index_shared_docs['fillna'])
def fillna(self, value=None, downcast=None):
    # Documentation comes from the ``Appender`` decorator above.
    # isnull is not implemented for MultiIndex, so filling is unsupported.
    message = 'isnull is not defined for MultiIndex'
    raise NotImplementedError(message)

def get_value(self, series, key):
# somewhat broken encapsulation
from pandas.core.indexing import maybe_droplevels
Expand Down
10 changes: 5 additions & 5 deletions pandas/src/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ def extract_ordinals(ndarray[object] values, freq):
p = values[i]
ordinals[i] = p.ordinal
if p.freqstr != freqstr:
raise ValueError("%s is wrong freq" % p)
raise ValueError(_DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr))

return ordinals

Expand Down Expand Up @@ -624,8 +624,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps,
return result


_DIFFERENT_FREQ_ERROR = "Input has different freq={1} from Period(freq={0})"

_DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})"
_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})"

cdef class Period(object):
"""
Expand Down Expand Up @@ -766,7 +766,7 @@ cdef class Period(object):
if isinstance(other, Period):
from pandas.tseries.frequencies import get_freq_code as _gfc
if other.freq != self.freq:
msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
raise ValueError(msg)
if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT:
return _nat_scalar_rules[op]
Expand Down Expand Up @@ -807,7 +807,7 @@ cdef class Period(object):
else:
ordinal = self.ordinal + other.n
return Period(ordinal=ordinal, freq=self.freq)
msg = _DIFFERENT_FREQ_ERROR.format(self.freqstr, other.freqstr)
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
raise ValueError(msg)
else: # pragma no cover
return NotImplemented
Expand Down
Loading