From 4b5a766ba28e6d305972a09aa7457ab45e2024d6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Oct 2016 15:30:41 +0200 Subject: [PATCH 1/4] BUG: DataFrame.quantile with NaNs (GH14357) np.percentile cannot handle a block with NaNs, and the masking approach only worked with regularly placed NaNs. Solution: when missing values are present, use np.nanpercentile when available, otherwise use np.percentile applied along the axis --- pandas/core/internals.py | 44 ++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11721a5bdac29..03b5af884a2a4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -6,7 +6,6 @@ from collections import defaultdict import numpy as np -from numpy import percentile as _quantile from pandas.core.base import PandasObject @@ -1315,16 +1314,31 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) - mask = isnull(self.values) - if not lib.isscalar(mask) and mask.any(): - # even though this could be a 2-d mask it appears - # as a 1-d result - mask = mask.reshape(values.shape) - result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) - values = _block_shape(values[~mask], ndim=self.ndim) - if self.ndim > 1: - values = values.reshape(result_shape) + def _nanpercentile(values, q, axis, **kw): + + mask = isnull(values) + if not lib.isscalar(mask) and mask.any(): + if _np_version_under1p9: + mask = isnull(values) + if self.ndim == 1: + values = values[~mask] + return np.percentile(values, q, axis=axis, **kw) + else: + if axis == 0: + values = values.T + mask = mask.T + result = [np.percentile(val[~m], q, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, copy=False).T + return result + else: + result = np.nanpercentile(values, q, axis=axis, **kw) + if result.ndim == 0: + 
result = result.item() + return result + else: + return np.percentile(values, q, axis=axis, **kw) from pandas import Float64Index is_empty = values.shape[axis] == 0 @@ -1343,13 +1357,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: try: - result = _quantile(values, np.array(qs) * 100, - axis=axis, **kw) + result = _nanpercentile(values, np.array(qs) * 100, + axis=axis, **kw) except ValueError: # older numpies don't handle an array for q - result = [_quantile(values, q * 100, - axis=axis, **kw) for q in qs] + result = [_nanpercentile(values, q * 100, + axis=axis, **kw) for q in qs] result = np.array(result, copy=False) if self.ndim > 1: @@ -1368,7 +1382,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: result = np.array([self._na_value] * len(self)) else: - result = _quantile(values, qs * 100, axis=axis, **kw) + result = _nanpercentile(values, qs * 100, axis=axis, **kw) ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) From 1c646d73c3770761728a7d86e7424a6a37a7c8f6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 28 Oct 2016 19:52:12 +0200 Subject: [PATCH 2/4] deal with empty / all NaN --- pandas/core/internals.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 03b5af884a2a4..d4780ceeb1061 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1315,20 +1315,31 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) + def _nanpercentile1D(values, mask, q, **kw): + values = values[~mask] + + if len(values) == 0: + if is_list_like(q): + return np.array([self._na_value] * len(q)) + else: + return self._na_value + + return np.percentile(values, q, **kw) + def _nanpercentile(values, q, axis, **kw): mask = isnull(values) if not lib.isscalar(mask) and mask.any(): - 
if _np_version_under1p9: + #if _np_version_under1p9: + if True: mask = isnull(values) if self.ndim == 1: - values = values[~mask] - return np.percentile(values, q, axis=axis, **kw) + return _nanpercentile1D(values, mask, q, axis=axis, **kw) else: if axis == 0: values = values.T mask = mask.T - result = [np.percentile(val[~m], q, **kw) for (val, m) + result = [_nanpercentile1D(val, m, q, **kw) for (val, m) in zip(list(values), list(mask))] result = np.array(result, copy=False).T return result From baa7b848e179f0a2e449cccbb768c4a0471cfe38 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 29 Oct 2016 10:11:17 +0200 Subject: [PATCH 3/4] deal with non-consolidatable difference in ndim --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/core/internals.py | 36 +++++------ pandas/tests/frame/test_quantile.py | 97 ++++++++++++++++++++++++++++ pandas/tests/series/test_quantile.py | 32 +++++++++ 4 files changed, 146 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a81ab6ed0311c..7132a25f72870 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -63,7 +63,7 @@ Bug Fixes - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) - +- Regression in ``DataFrame.quantile`` when missing values were present in some columns (:issue:`14357`). 
- Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d4780ceeb1061..beedfd981bdea 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1319,34 +1319,30 @@ def _nanpercentile1D(values, mask, q, **kw): values = values[~mask] if len(values) == 0: - if is_list_like(q): - return np.array([self._na_value] * len(q)) - else: + if lib.isscalar(q): return self._na_value + else: + return np.array([self._na_value] * len(q), + dtype=values.dtype) return np.percentile(values, q, **kw) def _nanpercentile(values, q, axis, **kw): - mask = isnull(values) + mask = isnull(self.values) if not lib.isscalar(mask) and mask.any(): - #if _np_version_under1p9: - if True: - mask = isnull(values) - if self.ndim == 1: - return _nanpercentile1D(values, mask, q, axis=axis, **kw) - else: - if axis == 0: - values = values.T - mask = mask.T - result = [_nanpercentile1D(val, m, q, **kw) for (val, m) - in zip(list(values), list(mask))] - result = np.array(result, copy=False).T - return result + if self.ndim == 1: + return _nanpercentile1D(values, mask, q, **kw) else: - result = np.nanpercentile(values, q, axis=axis, **kw) - if result.ndim == 0: - result = result.item() + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T return result else: return np.percentile(values, q, axis=axis, **kw) diff --git a/pandas/tests/frame/test_quantile.py 
b/pandas/tests/frame/test_quantile.py index 52e8697abe850..22414a6ba8a53 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -262,6 +262,11 @@ def test_quantile_datetime(self): index=[0.5], columns=[0, 1]) assert_frame_equal(result, expected) + # empty when numeric_only=True + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # result = df[['a', 'c']].quantile(.5) + # result = df[['a', 'c']].quantile([.5]) + def test_quantile_invalid(self): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: @@ -340,3 +345,95 @@ def test_quantile_box(self): pd.Timedelta('2 days')]], index=[0.5], columns=list('AaBbCc')) tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 14357 - float block where some cols have missing values + df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + # full-nan column + df['b'] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, + index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.NaT], index=['a'], name=0.5) + tm.assert_series_equal(res, exp) 
+ + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame({'a': [pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03')], + 'b': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], + name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], + columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty(self): + + # floats + df = DataFrame(columns=['a', 'b'], dtype='float64') + + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5, axis=1) + # res = df.quantile([0.5], axis=1) + + # ints + df = DataFrame(columns=['a', 'b'], dtype='int64') + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5) + + # datetimes + df = DataFrame(columns=['a', 'b'], dtype='datetime64') + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 7d2517987e526..76db6c90a685f 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -184,3 +184,35 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) + + def test_quantile_empty(self): + + # floats + s = Series([], dtype='float64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + 
+ res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # int + s = Series([], dtype='int64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # datetime + s = Series([], dtype='datetime64[ns]') + + res = s.quantile(0.5) + self.assertTrue(res is pd.NaT) + + res = s.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5]) + tm.assert_series_equal(res, exp) From cdd247b49d5a99f148f2b9cd45af270d17d21c7f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Nov 2016 11:40:22 +0100 Subject: [PATCH 4/4] use types.common.is_scalar instead of lib.isscalar --- pandas/core/internals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index beedfd981bdea..a96ee64b4cfb6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1319,7 +1319,7 @@ def _nanpercentile1D(values, mask, q, **kw): values = values[~mask] if len(values) == 0: - if lib.isscalar(q): + if is_scalar(q): return self._na_value else: return np.array([self._na_value] * len(q), @@ -1330,7 +1330,7 @@ def _nanpercentile1D(values, mask, q, **kw): def _nanpercentile(values, q, axis, **kw): mask = isnull(self.values) - if not lib.isscalar(mask) and mask.any(): + if not is_scalar(mask) and mask.any(): if self.ndim == 1: return _nanpercentile1D(values, mask, q, **kw) else: