Skip to content

Commit

Permalink
BUG: Fix Series.nlargest for integer boundary values (pandas-dev#21432)
Browse files Browse the repository at this point in the history
  • Loading branch information
jschendel authored and david-liu-brattle-1 committed Jun 18, 2018
1 parent 00cd937 commit f9fea8f
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 43 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,5 @@ Bug Fixes

**Other**

- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
-
71 changes: 71 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ def join_type(request):
return request.param


@pytest.fixture(params=['nlargest', 'nsmallest'])
def nselect_method(request):
"""
Fixture for trying all nselect methods
"""
return request.param


@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
def nulls_fixture(request):
"""
Expand Down Expand Up @@ -170,3 +178,66 @@ def string_dtype(request):
* 'U'
"""
return request.param


@pytest.fixture(params=["float32", "float64"])
def float_dtype(request):
"""
Parameterized fixture for float dtypes.
* float32
* float64
"""

return request.param


UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"]
ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES


@pytest.fixture(params=SIGNED_INT_DTYPES)
def sint_dtype(request):
"""
Parameterized fixture for signed integer dtypes.
* int8
* int16
* int32
* int64
"""

return request.param


@pytest.fixture(params=UNSIGNED_INT_DTYPES)
def uint_dtype(request):
"""
Parameterized fixture for unsigned integer dtypes.
* uint8
* uint16
* uint32
* uint64
"""

return request.param


@pytest.fixture(params=ALL_INT_DTYPES)
def any_int_dtype(request):
"""
Parameterized fixture for any integer dtypes.
* int8
* uint8
* int16
* uint16
* int32
* uint32
* int64
* uint64
"""

return request.param
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,9 +1133,12 @@ def compute(self, method):
return dropped[slc].sort_values(ascending=ascending).head(n)

# fast method
arr, _, _ = _ensure_data(dropped.values)
arr, pandas_dtype, _ = _ensure_data(dropped.values)
if method == 'nlargest':
arr = -arr
if is_integer_dtype(pandas_dtype):
# GH 21426: ensure reverse ordering at boundaries
arr -= 1

if self.keep == 'last':
arr = arr[::-1]
Expand Down
78 changes: 36 additions & 42 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from numpy.random import randn
import numpy as np

from pandas.compat import lrange, product, PY35
from pandas.compat import lrange, PY35
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12, _np_version_under1p15,
Expand Down Expand Up @@ -2260,54 +2260,49 @@ class TestNLargestNSmallest(object):

# ----------------------------------------------------------------------
# Top / bottom
@pytest.mark.parametrize(
'method, n, order',
product(['nsmallest', 'nlargest'], range(1, 11),
[['a'],
['c'],
['a', 'b'],
['a', 'c'],
['b', 'a'],
['b', 'c'],
['a', 'b', 'c'],
['c', 'a', 'b'],
['c', 'b', 'a'],
['b', 'c', 'a'],
['b', 'a', 'c'],
# dups!
['b', 'c', 'c'],
]))
def test_n(self, df_strings, method, n, order):
@pytest.mark.parametrize('order', [
['a'],
['c'],
['a', 'b'],
['a', 'c'],
['b', 'a'],
['b', 'c'],
['a', 'b', 'c'],
['c', 'a', 'b'],
['c', 'b', 'a'],
['b', 'c', 'a'],
['b', 'a', 'c'],
# dups!
['b', 'c', 'c']])
@pytest.mark.parametrize('n', range(1, 11))
def test_n(self, df_strings, nselect_method, n, order):
# GH10393
df = df_strings
if 'b' in order:

error_msg = self.dtype_error_msg_template.format(
column='b', method=method, dtype='object')
column='b', method=nselect_method, dtype='object')
with tm.assert_raises_regex(TypeError, error_msg):
getattr(df, method)(n, order)
getattr(df, nselect_method)(n, order)
else:
ascending = method == 'nsmallest'
result = getattr(df, method)(n, order)
ascending = nselect_method == 'nsmallest'
result = getattr(df, nselect_method)(n, order)
expected = df.sort_values(order, ascending=ascending).head(n)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
'method, columns',
product(['nsmallest', 'nlargest'],
product(['group'], ['category_string', 'string'])
))
def test_n_error(self, df_main_dtypes, method, columns):
@pytest.mark.parametrize('columns', [
('group', 'category_string'), ('group', 'string')])
def test_n_error(self, df_main_dtypes, nselect_method, columns):
df = df_main_dtypes
col = columns[1]
error_msg = self.dtype_error_msg_template.format(
column=columns[1], method=method, dtype=df[columns[1]].dtype)
column=col, method=nselect_method, dtype=df[col].dtype)
# escape some characters that may be in the repr
error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
.replace("[", "\\[").replace("]", "\\]"))
with tm.assert_raises_regex(TypeError, error_msg):
getattr(df, method)(2, columns)
getattr(df, nselect_method)(2, columns)

def test_n_all_dtypes(self, df_main_dtypes):
df = df_main_dtypes
Expand All @@ -2328,15 +2323,14 @@ def test_n_identical_values(self):
expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
'n, order',
product([1, 2, 3, 4, 5],
[['a', 'b', 'c'],
['c', 'b', 'a'],
['a'],
['b'],
['a', 'b'],
['c', 'b']]))
@pytest.mark.parametrize('order', [
['a', 'b', 'c'],
['c', 'b', 'a'],
['a'],
['b'],
['a', 'b'],
['c', 'b']])
@pytest.mark.parametrize('n', range(1, 6))
def test_n_duplicate_index(self, df_duplicates, n, order):
# GH 13412

Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1944,6 +1944,15 @@ def test_mode_sortwarning(self):
tm.assert_series_equal(result, expected)


def assert_check_nselect_boundary(vals, dtype, method):
# helper function for 'test_boundary_{dtype}' tests
s = Series(vals, dtype=dtype)
result = getattr(s, method)(3)
expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
expected = s.loc[expected_idxr]
tm.assert_series_equal(result, expected)


class TestNLargestNSmallest(object):

@pytest.mark.parametrize(
Expand Down Expand Up @@ -2028,6 +2037,32 @@ def test_n(self, n):
expected = s.sort_values().head(n)
assert_series_equal(result, expected)

def test_boundary_integer(self, nselect_method, any_int_dtype):
# GH 21426
dtype_info = np.iinfo(any_int_dtype)
min_val, max_val = dtype_info.min, dtype_info.max
vals = [min_val, min_val + 1, max_val - 1, max_val]
assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)

def test_boundary_float(self, nselect_method, float_dtype):
# GH 21426
dtype_info = np.finfo(float_dtype)
min_val, max_val = dtype_info.min, dtype_info.max
min_2nd, max_2nd = np.nextafter(
[min_val, max_val], 0, dtype=float_dtype)
vals = [min_val, min_2nd, max_2nd, max_val]
assert_check_nselect_boundary(vals, float_dtype, nselect_method)

@pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
def test_boundary_datetimelike(self, nselect_method, dtype):
# GH 21426
# use int64 bounds and +1 to min_val since true minimum is NaT
# (include min_val/NaT at end to maintain same expected_idxr)
dtype_info = np.iinfo('int64')
min_val, max_val = dtype_info.min, dtype_info.max
vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
assert_check_nselect_boundary(vals, dtype, nselect_method)


class TestCategoricalSeriesAnalytics(object):

Expand Down

0 comments on commit f9fea8f

Please sign in to comment.