diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3458cbe74d1bb3..61cecb4beb86a3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -371,13 +371,11 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - Conversion ^^^^^^^^^^ - Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) -- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`) - Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`) - Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`) - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`) @@ -463,6 +461,15 @@ Categorical the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) +PyPy +^^^^ + +- Compatibility with PyPy in :func:`read_csv` with ``usecols=[]`` and + :func:`read_json` (:issue:`17351`) +- Split tests into cases for CPython and PyPy where needed, which highlights the fragility + of index matching with ``float('nan')``, ``np.nan`` and ``NAT`` (:issue:`17351`) +- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, + so an approximation is used instead (:issue:`17228`) Other ^^^^^ diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index b0132532c16af7..85cf1d5e5e7a1a 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -409,7 +409,7 @@ JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) { } int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { - PyObject *label; + PyObject *label, *labels; npy_intp labelidx; // add key to label array, value to values array NpyArrContext *npyarr = (NpyArrContext *)obj; @@ -424,11 +424,11 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { if (!npyarr->labels[labelidx]) { npyarr->labels[labelidx] = PyList_New(0); } - + labels = npyarr->labels[labelidx]; // only fill label array once, assumes all column labels are the same // for 2-dimensional arrays. - if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) { - PyList_Append(npyarr->labels[labelidx], label); + if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) { + PyList_Append(labels, label); } if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) { @@ -439,16 +439,16 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { } int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { - PyDict_SetItem(obj, name, value); + int ret = PyDict_SetItem(obj, name, value); Py_DECREF((PyObject *)name); Py_DECREF((PyObject *)value); - return 1; + return ret == 0 ? 1 : 0; } int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { - PyList_Append(obj, value); + int ret = PyList_Append(obj, value); Py_DECREF((PyObject *)value); - return 1; + return ret == 0 ? 1 : 0; } JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8b1a921536a1dd..6adf154aabba7f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1716,6 +1716,7 @@ def _set_noconvert_columns(self): # A set of integers will be converted to a list in # the correct order every single time. usecols = list(self.usecols) + usecols.sort() elif (callable(self.usecols) or self.usecols_dtype not in ('empty', None)): # The names attribute should have the correct columns diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d69fbbcdf4bf60..fa73c9fc7b7225 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,7 +9,7 @@ from pandas.tests.indexes.common import Base from pandas.compat import (range, lrange, lzip, u, - text_type, zip, PY3, PY36) + text_type, zip, PY3, PY36, PYPY) import operator import numpy as np @@ -1370,13 +1370,21 @@ def test_isin(self): assert len(result) == 0 assert result.dtype == np.bool_ - def test_isin_nan(self): + @pytest.mark.skipif(PYPY, reason="np.nan is float('nan') on PyPy") + def test_isin_nan_not_pypy(self): + tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]), + np.array([False, False])) + + @pytest.mark.skipif(not PYPY, reason="np.nan is float('nan') on PyPy") + def test_isin_nan_pypy(self): + tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]), + np.array([False, True])) + + def test_isin_nan_common(self): tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([np.nan]), np.array([False, True])) tm.assert_numpy_array_equal(Index(['a', pd.NaT]).isin([pd.NaT]), np.array([False, True])) - tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]), - np.array([False, False])) tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]), np.array([False, False])) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 798d2444689615..86308192c91665 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -14,7 +14,7 @@ from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex, compat, date_range, period_range) -from pandas.compat import PY3, long, lrange, lzip, range, u +from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.core.indexes.base import InvalidIndexError from pandas._libs import lib @@ -2571,13 +2571,22 @@ def test_isin(self): assert len(result) == 0 assert result.dtype == np.bool_ - def test_isin_nan(self): + @pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") + def test_isin_nan_not_pypy(self): idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), np.array([False, False])) tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), np.array([False, False])) + @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") + def test_isin_nan_pypy(self): + idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), + np.array([False, True])) + tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), + np.array([False, True])) + def test_isin_level_kwarg(self): idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange( 4)]) diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 2fee2451c5e36f..0ea4757b10e942 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -3,8 +3,10 @@ import os import pandas.util.testing as tm -from pandas import read_csv, read_table +from pandas import read_csv, read_table, DataFrame from pandas.core.common import AbstractMethodError +from pandas._libs.lib import Timestamp +from pandas.compat import StringIO from .common import ParserTests from .header import HeaderTests @@ -100,3 +102,51 @@ def read_table(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine return read_table(*args, **kwds) + + +class TestUnsortedUsecols(object): + def test_override__set_noconvert_columns(self): + # GH 17351 - usecols needs to be sorted in _setnoconvert_columns + # based on the test_usecols_with_parse_dates test from usecols.py + from pandas.io.parsers import CParserWrapper, TextFileReader + + s = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + + parse_dates = [[1, 2]] + cols = { + 'a': [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + class MyTextFileReader(TextFileReader): + def __init__(self): + self._currow = 0 + self.squeeze = False + + class MyCParserWrapper(CParserWrapper): + def _set_noconvert_columns(self): + if self.usecols_dtype == 'integer': + # self.usecols is a set, which is documented as unordered + # but in practice, a CPython set of integers is sorted. + # In other implementations this assumption does not hold. + # The following code simulates a different order, which + # before GH 17351 would cause the wrong columns to be + # converted via the parse_dates parameter + self.usecols = list(self.usecols) + self.usecols.reverse() + return CParserWrapper._set_noconvert_columns(self) + + parser = MyTextFileReader() + parser.options = {'usecols': [0, 2, 3], + 'parse_dates': parse_dates, + 'delimiter': ','} + parser._engine = MyCParserWrapper(StringIO(s), **parser.options) + df = parser.read() + + tm.assert_frame_equal(df, expected)