diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 56cb22741b9a35..ec6dba05b2b0eb 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -72,7 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 14af2b8a120e0b..fb89fd2a5ffb29 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -35,6 +35,7 @@ Rolling window functions Rolling.skew Rolling.kurt Rolling.apply + Rolling.pipe Rolling.aggregate Rolling.quantile Rolling.sem @@ -76,6 +77,7 @@ Expanding window functions Expanding.skew Expanding.kurt Expanding.apply + Expanding.pipe Expanding.aggregate Expanding.quantile Expanding.sem diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7c165c87adb462..daf323acff1298 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2340,6 +2340,7 @@ Read a URL with no options: .. code-block:: ipython In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" + In [321]: pd.read_html(url) Out[321]: [ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund @@ -2366,6 +2367,7 @@ Read a URL while passing headers alongside the HTTP request: .. code-block:: ipython In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector + In [323]: pd.read_html(url) Out[323]: [ 0 1 @@ -2378,14 +2380,16 @@ Read a URL while passing headers alongside the HTTP request: 1 Host: www.sump.org 2 User-Agent: Python-urllib/3.8 3 Connection: close] + In [324]: headers = { - In [325]: 'User-Agent':'Mozilla Firefox v14.0', - In [326]: 'Accept':'application/json', - In [327]: 'Connection':'keep-alive', - In [328]: 'Auth':'Bearer 2*/f3+fe68df*4' - In [329]: } - In [340]: pd.read_html(url, storage_options=headers) - Out[340]: + .....: 'User-Agent':'Mozilla Firefox v14.0', + .....: 'Accept':'application/json', + .....: 'Connection':'keep-alive', + .....: 'Auth':'Bearer 2*/f3+fe68df*4' + .....: } + + In [325]: pd.read_html(url, storage_options=headers) + Out[325]: [ 0 1 0 Remote Socket: 51.15.105.256:51760 1 Protocol Version: HTTP/1.1 diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index b107a5d3ba1009..9e0e095eb4de89 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,8 +35,8 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_230.notable_bug_fixes:

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index ed0fe07f196b84..38a5e0ccab84b8 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -30,6 +30,7 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
+- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
 - :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
 - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
@@ -44,6 +45,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :class:`Rolling` and :class:`Expanding` now support the ``pipe`` method (:issue:`57076`)
 - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
@@ -51,6 +53,7 @@ Other enhancements
 - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
+- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
@@ -687,6 +690,7 @@ MultiIndex
 - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
 - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
 - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
+- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
 -

 I/O
diff --git a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h
index 0d62bb0ba915cd..51fdbc50bba571 100644 --- a/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h +++ b/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h @@ -170,8 +170,8 @@ typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); -typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen); +typedef const char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen); typedef void *(*JSPFN_MALLOC)(size_t size); typedef void (*JSPFN_FREE)(void *pptr); typedef void *(*JSPFN_REALLOC)(void *base, size_t size); diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c9f7a796a9b1ce..61e96fc835e4d2 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -148,7 +148,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = malloc(STREAM_INIT_SIZE); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -221,9 +221,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { char *orig_ptr = (void *)self->stream; TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) - self->stream = - (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, sizeof(char), &status); + self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, 1, &status); TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index c8d8b5ab6bd6e1..1564ecb64b01de 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -920,7 +920,7 @@ Perhaps implement recursion detection */ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { const char *value; - char *objName; + const char *objName; int count; JSOBJ iterObj; size_t szlen; diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 5f35860c59cb75..8342dbcd1763d1 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -53,8 +53,8 @@ Numeric decoder derived from TCL library npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, - size_t *_outLen); +typedef const char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, + size_t *_outLen); int object_is_decimal_type(PyObject *obj); int object_is_dataframe_type(PyObject *obj); @@ -106,7 +106,7 @@ typedef struct __TypeContext { double doubleValue; JSINT64 longValue; - char *cStr; + const char *cStr; NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; @@ -301,14 +301,15 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), - size_t *_outLen) { +static const char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), + size_t *_outLen) { PyObject *obj = 
(PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); } -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { +static const char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, + size_t *_outLen) { char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); if (encoded == NULL) { /* Something went wrong. @@ -321,8 +322,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); @@ -330,15 +331,15 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), } /* JSON callback. returns a char* and mutates the pointer to *len */ -static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { +static const char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); return GET_TC(tc)->cStr; } /* JSON callback */ -static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, - size_t *len) { +static const char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; @@ -349,7 +350,8 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, return PyDateTimeToIso(obj, base, len); } -static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { +static const char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, + size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { @@ -373,6 +375,27 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { return outValue; } +static const char *PyDecimalToUTF8Callback(JSOBJ _obj, JSONTypeContext *tc, + size_t *len) { + PyObject *obj = (PyObject *)_obj; + PyObject *format_spec = PyUnicode_FromStringAndSize("f", 1); + PyObject *str = PyObject_Format(obj, format_spec); + Py_DECREF(format_spec); + + if (str == NULL) { + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + + GET_TC(tc)->newObj = str; + + Py_ssize_t s_len; + char *outValue = (char *)PyUnicode_AsUTF8AndSize(str, &s_len); + *len = s_len; + + return outValue; +} + //============================================================================= // Numpy array iteration functions //============================================================================= @@ -537,10 +560,10 @@ static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = 
npyarr->index[npyarr->stridedim] - 1; @@ -588,11 +611,11 @@ static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { const npy_intp idx = blkCtxt->colIdx - 1; @@ -610,12 +633,12 @@ static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, return cStr; } -static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), - JSONTypeContext *tc, - size_t *outLen) { +static const char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - char *cStr; + const char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; @@ -796,9 +819,9 @@ static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -843,9 +866,9 @@ static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Set_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *Set_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -941,8 +964,8 @@ static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -973,9 +996,9 @@ static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *List_iterGetName(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { +static const char *List_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -984,24 +1007,16 @@ static char *List_iterGetName(JSOBJ Py_UNUSED(obj), //============================================================================= static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = 
"data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1021,8 +1036,8 @@ static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1033,28 +1048,20 @@ static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->cStr = "name"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; GET_TC(tc)->itemValue = get_values(obj); if (!GET_TC(tc)->itemValue) { return 0; @@ -1076,8 +1083,8 @@ static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1088,28 +1095,20 @@ static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } } static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (!GET_TC(tc)->cStr) { - return 0; - } - const Py_ssize_t index = GET_TC(tc)->index; Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->cStr = "columns"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->cStr = "index"; GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->cStr = "data"; Py_INCREF(obj); GET_TC(tc)->itemValue = obj; } else { @@ -1129,8 +1128,8 @@ static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1180,8 +1179,8 @@ static JSOBJ 
Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { +static const char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1467,8 +1466,18 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - pc->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + PyObject *is_nan_py = PyObject_RichCompare(obj, obj, Py_NE); + if (is_nan_py == NULL) { + goto INVALID; + } + int is_nan = (is_nan_py == Py_True); + Py_DECREF(is_nan_py); + if (is_nan) { + tc->type = JT_NULL; + return; + } + pc->PyTypeToUTF8 = PyDecimalToUTF8Callback; + tc->type = JT_UTF8; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { @@ -1871,7 +1880,6 @@ static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->rowLabels = NULL; NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; @@ -1922,8 +1930,8 @@ static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->iterGetValue(obj, tc); } -static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, - size_t *outLen) { +static const char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d6d69a49c95397..f697180da5eeb2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -679,7 +679,7 @@ cdef char* c_strftime(npy_datetimestruct *dts, char *fmt): c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1 c_date.tm_isdst = -1 - result = malloc(result_len * sizeof(char)) + result = malloc(result_len) if result is NULL: raise MemoryError() diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index a18a1e9d5cbb79..c1178c72f3edc6 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -3,6 +3,7 @@ """ from pandas._libs import NaTType +from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType from pandas.core.groupby import ( @@ -44,6 +45,7 @@ "JsonReader", "NAType", "NaTType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a3..f9c10a7758bd20 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1317,6 +1317,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. 
+ + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56f8adda932519..eefe08859c1e91 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1012,7 +1012,7 @@ def mode( return npresult, res_mask # type: ignore[return-value] try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d9c8eb3a41b64..900548a239c8ec 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1619,6 +1620,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1654,6 +1658,57 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. + pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. 
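A minimal usage sketch of the string accumulations wired up above (an illustration, not part of the patch: it assumes a pandas build with this change and PyArrow installed; the expected values mirror the parametrized cases added to pandas/tests/series/test_cumulative.py later in this diff):

    import pandas as pd

    ser = pd.Series(["b", None, "a", "c"], dtype="string[pyarrow]")

    # skipna=True keeps NA positions but accumulates past them; cumsum
    # concatenates, filling the gap with "" before accumulating.
    ser.cumsum(skipna=True)   # ["b", <NA>, "ba", "bac"]

    # cummin/cummax carry the running extremum across the gap via the
    # forward/backward fill performed in _str_accumulate.
    ser.cummin(skipna=True)   # ["b", <NA>, "a", "a"]

    # skipna=False nulls everything from the first NA onward.
    ser.cummax(skipna=False)  # ["b", <NA>, <NA>, <NA>]

    # cumprod has no string analogue and raises TypeError.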
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 3b881cfd2df2fa..623a6a10c75b53 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -533,6 +533,11 @@ def _str_map_nan_semantics(
         else:
             return self._str_map_str_or_object(dtype, na_value, arr, f, mask)

+    def view(self, dtype: Dtype | None = None) -> ArrayLike:
+        if dtype is not None:
+            raise TypeError("Cannot change data-type for string array.")
+        return super().view(dtype=dtype)
+
     # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
     # incompatible with definition in base class "ExtensionArray"
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 1dd1b12d6ae959..1eb1a630056a20 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -2339,7 +2339,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype:
             )
         if not string.endswith("[pyarrow]"):
             raise TypeError(f"'{string}' must end with '[pyarrow]'")
-        if string == "string[pyarrow]":
+        if string in ("string[pyarrow]", "str[pyarrow]"):
             # Ensure Registry.find skips ArrowDtype to use StringDtype instead
             raise TypeError("string[pyarrow] should be constructed by StringDtype")
         if pa_version_under10p1:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 851bc1ce4075c0..ffffaeba4196ea 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -7967,6 +7967,16 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
         new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
         new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
+
+        # GH#60498 For MultiIndex column alignment
+        if isinstance(cols, MultiIndex):
+            # When overwriting column names, make a shallow copy so as to not
+            # modify the input DataFrames
+            new_left = new_left.copy(deep=False)
+            new_right = new_right.copy(deep=False)
+            new_left.columns = cols
+            new_right.columns = cols
+
         result = op(new_left, new_right)

         # Do the join on the columns instead of using left._align_for_op
@@ -7997,6 +8007,13 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
         if not isinstance(right, DataFrame):
             return False

+        if (
+            isinstance(self.columns, MultiIndex)
+            or isinstance(right.columns, MultiIndex)
+        ) and not self.columns.equals(right.columns):
+            # GH#60498 Reindex if the MultiIndex columns do not match
+            return True
+
         if fill_value is None and level is None and axis == 1:
             # TODO: any other cases we should handle here?
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 935762d0455c5d..2db50bbbdfa371 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -190,10 +190,31 @@ def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self:
         """
         Create :class:`pandas.RangeIndex` from a ``range`` object.

+        This method provides a way to create a :class:`pandas.RangeIndex` directly
+        from a Python ``range`` object. The resulting :class:`RangeIndex` will have
+        the same start, stop, and step values as the input ``range`` object.
+        It is particularly useful for constructing indices in an efficient and
+        memory-friendly manner.
+
+        Parameters
+        ----------
+        data : range
+            The range object to be converted into a RangeIndex.
+        name : str, default None
+            Name to be stored in the index.
+        dtype : Dtype or None
+            Data type for the RangeIndex. If None, the default integer type will
+            be used.
+
         Returns
         -------
         RangeIndex

+        See Also
+        --------
+        RangeIndex : Immutable Index implementing a monotonic integer range.
+        Index : Immutable sequence used for indexing and alignment.
+
         Examples
         --------
         >>> pd.RangeIndex.from_range(range(5))
diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py
index cdb670ee218b46..6dbc52a99e70c6 100644
--- a/pandas/core/window/doc.py
+++ b/pandas/core/window/doc.py
@@ -85,6 +85,63 @@ def create_section_header(header: str) -> str:
     """
 ).replace("\n", "", 1)

+template_pipe = """
+Apply a ``func`` with arguments to this %(klass)s object and return its result.
+
+Use `.pipe` when you want to improve readability by chaining together
+functions that expect Series, DataFrames, GroupBy, Rolling, Expanding, or
+Resampler objects.
+Instead of writing
+
+>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
+>>> g = lambda x, arg1: x * 5 / arg1
+>>> f = lambda x: x ** 4
+>>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, index=pd.date_range('2012-08-02', periods=4))
+>>> h(g(f(df.rolling('2D')), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP
+
+You can write
+
+>>> (df.rolling('2D')
+...    .pipe(f)
+...    .pipe(g, arg1=1)
+...    .pipe(h, arg2=2, arg3=3))  # doctest: +SKIP
+
+which is much more readable.
+
+Parameters
+----------
+func : callable or tuple of (callable, str)
+    Function to apply to this %(klass)s object or, alternatively,
+    a `(callable, data_keyword)` tuple where `data_keyword` is a
+    string indicating the keyword of `callable` that expects the
+    %(klass)s object.
+*args : iterable, optional
+    Positional arguments passed into `func`.
+**kwargs : dict, optional
+    A dictionary of keyword arguments passed into `func`.
+
+Returns
+-------
+object
+    The return value of `func`.
+
+See Also
+--------
+Series.pipe : Apply a function with arguments to a series.
+DataFrame.pipe : Apply a function with arguments to a dataframe.
+apply : Apply function to each window instead of to the
+    full %(klass)s object.
+
+Notes
+-----
+See more `here
+`_
+
+Examples
+--------
+%(examples)s
+"""
+
 numba_notes = (
     "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for "
     "extended documentation and performance considerations for the Numba engine.\n\n"
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index bff3a1660eba9d..6a7d0329ab6daf 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -5,9 +5,15 @@
     TYPE_CHECKING,
     Any,
     Literal,
+    final,
+    overload,
 )

-from pandas.util._decorators import doc
+from pandas.util._decorators import (
+    Appender,
+    Substitution,
+    doc,
+)

 from pandas.core.indexers.objects import (
     BaseIndexer,
@@ -20,6 +26,7 @@
     kwargs_numeric_only,
     numba_notes,
     template_header,
+    template_pipe,
     template_returns,
     template_see_also,
     window_agg_numba_parameters,
@@ -34,7 +41,11 @@
     from collections.abc import Callable

     from pandas._typing import (
+        Concatenate,
+        P,
         QuantileInterpolation,
+        Self,
+        T,
         WindowingRankType,
     )
@@ -241,6 +252,54 @@ def apply(
             kwargs=kwargs,
         )

+    @overload
+    def pipe(
+        self,
+        func: Callable[Concatenate[Self, P], T],
+        *args: P.args,
+        **kwargs: P.kwargs,
+    ) -> T: ...
+
+    @overload
+    def pipe(
+        self,
+        func: tuple[Callable[..., T], str],
+        *args: Any,
+        **kwargs: Any,
+    ) -> T: ...
+
+    @final
+    @Substitution(
+        klass="Expanding",
+        examples="""
+    >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
+    ...
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each expanding window's maximum and minimum + value in one pass, you can do + + >>> df.expanding().pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 2.0 + 2012-08-05 3.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 385ffb901acf00..90c3cff975ff07 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,6 +14,8 @@ TYPE_CHECKING, Any, Literal, + final, + overload, ) import numpy as np @@ -26,7 +28,11 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) from pandas.core.dtypes.common import ( ensure_float64, @@ -81,6 +87,7 @@ kwargs_scipy, numba_notes, template_header, + template_pipe, template_returns, template_see_also, window_agg_numba_parameters, @@ -102,8 +109,12 @@ from pandas._typing import ( ArrayLike, + Concatenate, NDFrameT, QuantileInterpolation, + P, + Self, + T, WindowingRankType, npt, ) @@ -1529,6 +1540,30 @@ def apply_func(values, begin, end, min_periods, raw=raw): return apply_func + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + def sum( self, numeric_only: bool = False, @@ -2044,6 +2079,54 @@ def apply( kwargs=kwargs, ) + @overload + def pipe( + self, + func: Callable[Concatenate[Self, P], T], + *args: P.args, + **kwargs: P.kwargs, + ) -> T: ... + + @overload + def pipe( + self, + func: tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: ... + + @final + @Substitution( + klass="Rolling", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... 
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each rolling 2-day window's maximum and minimum + value in one pass, you can do + + >>> df.rolling('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 0.0 + 2012-08-03 1.0 + 2012-08-04 1.0 + 2012-08-05 1.0""", + ) + @Appender(template_pipe) + def pipe( + self, + func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], + *args: Any, + **kwargs: Any, + ) -> T: + return super().pipe(func, *args, **kwargs) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c1d9f5ea4d25c4..4a05259a980876 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -261,6 +261,7 @@ class TestApi(Base): "JsonReader", "NaTType", "NAType", + "NoDefault", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c52168ae48ca8d..ce71cfec535e4b 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,7 +4,10 @@ import numpy as np import pytest -from pandas.compat import WASM +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.core.dtypes.common import is_number @@ -163,10 +166,10 @@ def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum": + if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: request.applymarker( pytest.mark.xfail( - raises=(TypeError, NotImplementedError), + raises=NotImplementedError, reason="TODO(infer_string) cumsum not yet implemented for string", ) ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 5a59617ce5bd3c..fa48393dd183eb 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -837,6 +837,26 @@ def test_pandas_dtype_string_dtypes(string_storage): assert result == pd.StringDtype(string_storage, na_value=pd.NA) +def test_pandas_dtype_string_dtype_alias_with_storage(): + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[python]") + + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[pyarrow]") + + result = pandas_dtype("string[python]") + assert result == pd.StringDtype("python", na_value=pd.NA) + + if HAS_PYARROW: + result = pandas_dtype("string[pyarrow]") + assert result == pd.StringDtype("pyarrow", na_value=pd.NA) + else: + with pytest.raises( + ImportError, match="required for PyArrow backed StringArray" + ): + pandas_dtype("string[pyarrow]") + + @td.skip_if_installed("pyarrow") def test_construct_from_string_without_pyarrow_installed(): # GH 57928 diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a5..9a2f186c2a00bf 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. 
Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c5f5a65b77eead..4fccf02e08bd6a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. + return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e19351b2ad0584..6ce48e434d3294 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -192,6 +194,14 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index de5029b9f18b22..43db234267f21f 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd import pandas._testing as tm @@ -136,9 +132,6 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6b61fe8b05219d..7ada1884feb90e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2033,6 +2033,31 @@ def test_arithmetic_multiindex_align(): 
tm.assert_frame_equal(result, expected) +def test_arithmetic_multiindex_column_align(): + # GH#60498 + df1 = DataFrame( + data=100, + columns=MultiIndex.from_product( + [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"] + ), + index=["C1", "C2"], + ) + df2 = DataFrame( + data=np.array([[0.1, 0.25], [0.2, 0.45]]), + columns=MultiIndex.from_product([["1A", "1B"]], names=["Lev1"]), + index=["C1", "C2"], + ) + expected = DataFrame( + data=np.array([[10.0, 10.0, 25.0, 25.0], [20.0, 20.0, 45.0, 45.0]]), + columns=MultiIndex.from_product( + [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"] + ), + index=["C1", "C2"], + ) + result = df1 * df2 + tm.assert_frame_equal(result, expected) + + def test_bool_frame_mult_float(): # GH 18549 df = DataFrame(True, list("ab"), list("cd")) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index fde4dfeed9c55c..04b1456cdbea67 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -672,23 +672,10 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - # TODO(infer_string) avoid this UserWarning for python storage - warning = ( - None - if using_infer_string and df.A.dtype.storage == "pyarrow" - else UserWarning - ) - with tm.assert_produces_warning(warning, match="Unable to sort modes"): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index dae7fe2575c221..abc14d10514faf 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib import pandas as pd @@ -1675,7 +1673,6 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1923,7 +1920,6 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 36eb2f119ae254..5bae9b1fd9882a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas.util._test_decorators as td @@ -2468,12 +2466,13 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") 
-def test_groupby_none_column_name():
+def test_groupby_none_column_name(using_infer_string):
     # GH#47348
     df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
-    result = df.groupby(by=[None]).sum()
-    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None))
+    by = [np.nan] if using_infer_string else [None]
+    gb = df.groupby(by=by)
+    result = gb.sum()
+    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0]))
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 06df8902f319c0..608158d40cf235 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -351,14 +351,12 @@ def test_view_with_args_object_array_raises(self, index):
             msg = "When changing to a larger dtype"
             with pytest.raises(ValueError, match=msg):
                 index.view("i8")
-        elif index.dtype == "str" and not index.dtype.storage == "python":
-            # TODO(infer_string): Make the errors consistent
-            with pytest.raises(NotImplementedError, match="i8"):
-                index.view("i8")
         else:
             msg = (
-                "Cannot change data-type for array of references.|"
-                "Cannot change data-type for object array.|"
+                r"Cannot change data-type for array of references\.|"
+                r"Cannot change data-type for object array\.|"
+                r"Cannot change data-type for array of strings\.|"
+                r"Cannot change data-type for string array\."
             )
             with pytest.raises(TypeError, match=msg):
                 index.view("i8")
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index ff7d34c85c0159..953a9246da1cd2 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -41,6 +41,7 @@ def test_read_zipped_json(datapath):

 @td.skip_if_not_us_locale
 @pytest.mark.single_cpu
+@pytest.mark.network
 def test_with_s3_url(compression, s3_public_bucket, s3so):
     # Bucket created in tests/io/conftest.py
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
index 8de289afe9ff96..12ae24b064c9d7 100644
--- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
+++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
@@ -159,7 +159,7 @@ def test_build_decimal_series(self, dc):
         expected = OrderedDict(
             [
                 ("schema", schema),
-                ("data", [OrderedDict([("id", 0), ("a", 10.0)])]),
+                ("data", [OrderedDict([("id", 0), ("a", "10")])]),
             ]
         )
@@ -245,7 +245,7 @@ def test_to_json(self, da, dc, sa, ia):
             [
                 ("idx", 0),
                 ("A", "2021-10-10T00:00:00.000"),
-                ("B", 10.0),
+                ("B", "10"),
                 ("C", "pandas"),
                 ("D", 10),
             ]
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index ad9dbf7554a8b2..5dc1272880c9bc 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1,6 +1,5 @@
 import datetime
 from datetime import timedelta
-from decimal import Decimal
 from io import StringIO
 import json
 import os
@@ -1413,6 +1412,7 @@ def test_read_inline_jsonl(self):
         tm.assert_frame_equal(result, expected)

     @pytest.mark.single_cpu
+    @pytest.mark.network
     @td.skip_if_not_us_locale
     def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
         # GH17200
@@ -2012,6 +2012,7 @@ def test_json_multiindex(self):
         assert result == expected

     @pytest.mark.single_cpu
+    @pytest.mark.network
     def test_to_s3(self,
s3_public_bucket, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.applymarker(mark) - expected_warning = None msg = ( "The default 'epoch' date format is deprecated and will be removed " diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 62118f1c82ebbd..c5ccc3b3f71841 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -57,56 +57,56 @@ def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.ujson_dumps(sut, double_precision=15) decoded = ujson.ujson_loads(encoded) - assert decoded == 1337.1337 + assert decoded == "1337.1337" sut = decimal.Decimal("0.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "1.0" + assert encoded == '"0.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.95" sut = decimal.Decimal("0.94") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "0.9" + assert encoded == '"0.94"' decoded = ujson.ujson_loads(encoded) - assert decoded == 0.9 + assert decoded == "0.94" sut = decimal.Decimal("1.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "2.0" + assert encoded == '"1.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == 2.0 + assert decoded == "1.95" sut = decimal.Decimal("-1.95") encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == "-2.0" + assert encoded == '"-1.95"' decoded = ujson.ujson_loads(encoded) - assert decoded == -2.0 + assert decoded == "-1.95" sut = decimal.Decimal("0.995") encoded = ujson.ujson_dumps(sut, double_precision=2) - assert encoded == "1.0" + assert encoded == '"0.995"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.995" sut = decimal.Decimal("0.9995") encoded = ujson.ujson_dumps(sut, double_precision=3) - assert encoded == "1.0" + assert encoded == '"0.9995"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.9995" sut = decimal.Decimal("0.99999999999999944") encoded = ujson.ujson_dumps(sut, double_precision=15) - assert encoded == "1.0" + assert encoded == '"0.99999999999999944"' decoded = ujson.ujson_loads(encoded) - assert decoded == 1.0 + assert decoded == "0.99999999999999944" @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 476978aeab15a6..a7bb80727206e0 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1607,17 +1607,10 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(["foo", np.nan], dtype=object) + def test_mode_sort_with_na(self): s = Series([1, "foo", "foo", np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, match="Unable to sort modes"): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - + expected = Series(["foo", np.nan], dtype=object) + 
result = s.mode(dropna=False) tm.assert_series_equal(result, expected) def test_mode_boolean_with_na(self): diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46b..89882d9d797c52 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -227,3 +229,55 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(data, dtype=pyarrow_string_dtype) + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 34a6377b5786f2..30e6ebf0eed135 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -293,23 +293,12 @@ def test_startswith_endswith_validate_na(any_string_dtype): dtype=any_string_dtype, ) - dtype = ser.dtype - if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): - msg = "Allowing a non-bool 'na' in 
obj.str.startswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.startswith("kapow", na="baz") - msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.endswith("bar", na="baz") - else: - # TODO(infer_string): don't surface pyarrow errors - import pyarrow as pa - - msg = "Could not convert 'baz' with type str: tried to convert to boolean" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.startswith("kapow", na="baz") - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.endswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 3b989e284ca256..541b0ea150ba60 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -6,6 +6,7 @@ import pandas.util._test_decorators as td from pandas import ( + ArrowDtype, DataFrame, Index, MultiIndex, @@ -113,8 +114,10 @@ def test_get_dummies_with_str_dtype(any_string_dtype): # GH#47872 @td.skip_if_no("pyarrow") def test_get_dummies_with_pa_str_dtype(any_string_dtype): + import pyarrow as pa + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype="str[pyarrow]") + result = s.str.get_dummies("|", dtype=ArrowDtype(pa.string())) expected = DataFrame( [ ["true", "true", "false"], @@ -122,6 +125,6 @@ def test_get_dummies_with_pa_str_dtype(any_string_dtype): ["false", "false", "false"], ], columns=list("abc"), - dtype="str[pyarrow]", + dtype=ArrowDtype(pa.string()), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 15eaa8c1674878..877b50e37670c0 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -177,6 +177,38 @@ def test_agg_nested_dicts(): r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) +@pytest.mark.parametrize( + "func,window_size", + [ + ( + "rolling", + 2, + ), + ( + "expanding", + None, + ), + ], +) +def test_pipe(func, window_size): + # Issue #57076 + df = DataFrame( + { + "B": np.random.default_rng(2).standard_normal(10), + "C": np.random.default_rng(2).standard_normal(10), + } + ) + r = getattr(df, func)(window_size) + + expected = r.max() - r.mean() + result = r.pipe(lambda x: x.max() - x.mean()) + tm.assert_frame_equal(result, expected) + + expected = r.max() - 2 * r.min() + result = r.pipe(lambda x, k: x.max() - k * x.min(), k=2) + tm.assert_frame_equal(result, expected) + + def test_count_nonnumeric_types(step): # GH12541 cols = [