Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: DataFrame.quantile with NaNs (GH14357) #14536

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ Bug Fixes


- Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`)

- Regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`).
- Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
- Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`)
- Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`)
Expand Down
51 changes: 36 additions & 15 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from collections import defaultdict

import numpy as np
from numpy import percentile as _quantile

from pandas.core.base import PandasObject

Expand Down Expand Up @@ -1315,16 +1314,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None):

values = self.get_values()
values, _, _, _ = self._try_coerce_args(values, values)
mask = isnull(self.values)
if not lib.isscalar(mask) and mask.any():

# even though this could be a 2-d mask it appears
# as a 1-d result
mask = mask.reshape(values.shape)
result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1))
values = _block_shape(values[~mask], ndim=self.ndim)
if self.ndim > 1:
values = values.reshape(result_shape)
def _nanpercentile1D(values, mask, q, **kw):
values = values[~mask]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from pandas.types.common import is_scalar


if len(values) == 0:
if is_scalar(q):
return self._na_value
else:
return np.array([self._na_value] * len(q),
dtype=values.dtype)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might move some of this to to pandas.core.nanops (though you might have to move slightly more as that takes axis arg). Its esentially what you are doing here, but in a slightly more general framework. call it nanquantile (or nanpercentile)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I did not put it there initially, was because this is less general as the current functions in the nanops module. For example, I pass here the mask alongside the values because datetimelike values are already converted to integers at this point (where the NaTs are filled) because np.percentile cannot deal with datetime-like values

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback Opinion about this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant move ALL of this; the nanops do everything (based on dtype), are basically ufuncs per-dtype. Its ok for now if you want to merge (to fix the bug). But let's open a new issue to move this code. All of the rest of it is there (for other ops). We don't do very much inside the block managers, mainly just assemble blocks, actual calculations are pushed to other routines (numpy or pandas)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds good, will open a new issue (and one for the failing empty ones as well)

return np.percentile(values, q, **kw)

def _nanpercentile(values, q, axis, **kw):

mask = isnull(self.values)
if not is_scalar(mask) and mask.any():
if self.ndim == 1:
return _nanpercentile1D(values, mask, q, **kw)
else:
# for nonconsolidatable blocks mask is 1D, but values 2D
if mask.ndim < values.ndim:
mask = mask.reshape(values.shape)
if axis == 0:
values = values.T
mask = mask.T
result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
in zip(list(values), list(mask))]
result = np.array(result, dtype=values.dtype, copy=False).T
return result
else:
return np.percentile(values, q, axis=axis, **kw)

from pandas import Float64Index
is_empty = values.shape[axis] == 0
Expand All @@ -1343,13 +1364,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
else:

try:
result = _quantile(values, np.array(qs) * 100,
axis=axis, **kw)
result = _nanpercentile(values, np.array(qs) * 100,
axis=axis, **kw)
except ValueError:

# older numpies don't handle an array for q
result = [_quantile(values, q * 100,
axis=axis, **kw) for q in qs]
result = [_nanpercentile(values, q * 100,
axis=axis, **kw) for q in qs]

result = np.array(result, copy=False)
if self.ndim > 1:
Expand All @@ -1368,7 +1389,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
else:
result = np.array([self._na_value] * len(self))
else:
result = _quantile(values, qs * 100, axis=axis, **kw)
result = _nanpercentile(values, qs * 100, axis=axis, **kw)

ndim = getattr(result, 'ndim', None) or 0
result = self._try_coerce_result(result)
Expand Down
97 changes: 97 additions & 0 deletions pandas/tests/frame/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,11 @@ def test_quantile_datetime(self):
index=[0.5], columns=[0, 1])
assert_frame_equal(result, expected)

# empty when numeric_only=True
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# result = df[['a', 'c']].quantile(.5)
# result = df[['a', 'c']].quantile([.5])

def test_quantile_invalid(self):
msg = 'percentiles should all be in the interval \\[0, 1\\]'
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
Expand Down Expand Up @@ -340,3 +345,95 @@ def test_quantile_box(self):
pd.Timedelta('2 days')]],
index=[0.5], columns=list('AaBbCc'))
tm.assert_frame_equal(res, exp)

def test_quantile_nan(self):

# GH 14357 - float block where some cols have missing values
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
df.iloc[-1, 1] = np.nan

res = df.quantile(0.5)
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)

res = df.quantile([0.5, 0.75])
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)

res = df.quantile(0.5, axis=1)
exp = Series(np.arange(1.0, 6.0), name=0.5)
tm.assert_series_equal(res, exp)

res = df.quantile([0.5, 0.75], axis=1)
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)

# full-nan column
df['b'] = np.nan

res = df.quantile(0.5)
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)

res = df.quantile([0.5, 0.75])
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
index=[0.5, 0.75])
tm.assert_frame_equal(res, exp)

def test_quantile_nat(self):

# full NaT column
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})

res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.NaT], index=['a'], name=0.5)
tm.assert_series_equal(res, exp)

res = df.quantile([0.5], numeric_only=False)
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
tm.assert_frame_equal(res, exp)

# mixed non-null / full null column
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
pd.Timestamp('2012-01-02'),
pd.Timestamp('2012-01-03')],
'b': [pd.NaT, pd.NaT, pd.NaT]})

res = df.quantile(0.5, numeric_only=False)
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
name=0.5)
tm.assert_series_equal(res, exp)

res = df.quantile([0.5], numeric_only=False)
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
columns=['a', 'b'])
tm.assert_frame_equal(res, exp)

def test_quantile_empty(self):

# floats
df = DataFrame(columns=['a', 'b'], dtype='float64')

res = df.quantile(0.5)
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
tm.assert_series_equal(res, exp)

res = df.quantile([0.5])
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
tm.assert_frame_equal(res, exp)

# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5, axis=1)
# res = df.quantile([0.5], axis=1)

# ints
df = DataFrame(columns=['a', 'b'], dtype='int64')

# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
# res = df.quantile(0.5)

# datetimes
df = DataFrame(columns=['a', 'b'], dtype='datetime64')

# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
# res = df.quantile(0.5, numeric_only=False)
32 changes: 32 additions & 0 deletions pandas/tests/series/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,35 @@ def test_quantile_nat(self):

res = Series([pd.NaT, pd.NaT]).quantile([0.5])
tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5]))

def test_quantile_empty(self):

# floats
s = Series([], dtype='float64')

res = s.quantile(0.5)
self.assertTrue(np.isnan(res))

res = s.quantile([0.5])
exp = Series([np.nan], index=[0.5])
tm.assert_series_equal(res, exp)

# int
s = Series([], dtype='int64')

res = s.quantile(0.5)
self.assertTrue(np.isnan(res))

res = s.quantile([0.5])
exp = Series([np.nan], index=[0.5])
tm.assert_series_equal(res, exp)

# datetime
s = Series([], dtype='datetime64[ns]')

res = s.quantile(0.5)
self.assertTrue(res is pd.NaT)

res = s.quantile([0.5])
exp = Series([pd.NaT], index=[0.5])
tm.assert_series_equal(res, exp)