-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: DataFrame.quantile with NaNs (GH14357) #14536
Changes from all commits
4b5a766
1c646d7
baa7b84
cdd247b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,6 @@ | |
from collections import defaultdict | ||
|
||
import numpy as np | ||
from numpy import percentile as _quantile | ||
|
||
from pandas.core.base import PandasObject | ||
|
||
|
@@ -1315,16 +1314,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): | |
|
||
values = self.get_values() | ||
values, _, _, _ = self._try_coerce_args(values, values) | ||
mask = isnull(self.values) | ||
if not lib.isscalar(mask) and mask.any(): | ||
|
||
# even though this could be a 2-d mask it appears | ||
# as a 1-d result | ||
mask = mask.reshape(values.shape) | ||
result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) | ||
values = _block_shape(values[~mask], ndim=self.ndim) | ||
if self.ndim > 1: | ||
values = values.reshape(result_shape) | ||
def _nanpercentile1D(values, mask, q, **kw): | ||
values = values[~mask] | ||
|
||
if len(values) == 0: | ||
if is_scalar(q): | ||
return self._na_value | ||
else: | ||
return np.array([self._na_value] * len(q), | ||
dtype=values.dtype) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I might move some of this to `nanops`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The reason I did not put it there initially was because this is less general than the current functions in the `nanops` module. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jreback Opinion about this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I meant move ALL of this; the nanops do everything (based on dtype), and are basically ufuncs per-dtype. It's OK for now if you want to merge (to fix the bug). But let's open a new issue to move this code. All of the rest of it is there (for other ops). We don't do very much inside the block managers, mainly just assemble blocks; actual calculations are pushed to other routines (numpy or pandas). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sounds good, will open a new issue (and one for the failing empty ones as well). |
||
return np.percentile(values, q, **kw) | ||
|
||
def _nanpercentile(values, q, axis, **kw): | ||
|
||
mask = isnull(self.values) | ||
if not is_scalar(mask) and mask.any(): | ||
if self.ndim == 1: | ||
return _nanpercentile1D(values, mask, q, **kw) | ||
else: | ||
# for nonconsolidatable blocks mask is 1D, but values 2D | ||
if mask.ndim < values.ndim: | ||
mask = mask.reshape(values.shape) | ||
if axis == 0: | ||
values = values.T | ||
mask = mask.T | ||
result = [_nanpercentile1D(val, m, q, **kw) for (val, m) | ||
in zip(list(values), list(mask))] | ||
result = np.array(result, dtype=values.dtype, copy=False).T | ||
return result | ||
else: | ||
return np.percentile(values, q, axis=axis, **kw) | ||
|
||
from pandas import Float64Index | ||
is_empty = values.shape[axis] == 0 | ||
|
@@ -1343,13 +1364,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): | |
else: | ||
|
||
try: | ||
result = _quantile(values, np.array(qs) * 100, | ||
axis=axis, **kw) | ||
result = _nanpercentile(values, np.array(qs) * 100, | ||
axis=axis, **kw) | ||
except ValueError: | ||
|
||
# older numpies don't handle an array for q | ||
result = [_quantile(values, q * 100, | ||
axis=axis, **kw) for q in qs] | ||
result = [_nanpercentile(values, q * 100, | ||
axis=axis, **kw) for q in qs] | ||
|
||
result = np.array(result, copy=False) | ||
if self.ndim > 1: | ||
|
@@ -1368,7 +1389,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): | |
else: | ||
result = np.array([self._na_value] * len(self)) | ||
else: | ||
result = _quantile(values, qs * 100, axis=axis, **kw) | ||
result = _nanpercentile(values, qs * 100, axis=axis, **kw) | ||
|
||
ndim = getattr(result, 'ndim', None) or 0 | ||
result = self._try_coerce_result(result) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
from pandas.types.common import is_scalar