diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index aecc609df574ea..e48a2060a3b34e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -23,12 +23,12 @@ class Factorize: "int", "uint", "float", - "string", + "object", "datetime64[ns]", "datetime64[ns, tz]", "Int64", "boolean", - "string_arrow", + "string[pyarrow]", ], ] param_names = ["unique", "sort", "dtype"] @@ -36,28 +36,25 @@ class Factorize: def setup(self, unique, sort, dtype): N = 10 ** 5 string_index = tm.makeStringIndex(N) - try: - from pandas.core.arrays.string_arrow import ArrowStringDtype - - string_arrow = pd.array(string_index, dtype=ArrowStringDtype()) - except ImportError: - string_arrow = None - - if dtype == "string_arrow" and not string_arrow: - raise NotImplementedError + string_arrow = None + if dtype == "string[pyarrow]": + try: + string_arrow = pd.array(string_index, dtype="string[pyarrow]") + except ImportError: + raise NotImplementedError data = { "int": pd.Int64Index(np.arange(N)), "uint": pd.UInt64Index(np.arange(N)), "float": pd.Float64Index(np.random.randn(N)), - "string": string_index, + "object": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), "Int64": pd.array(np.arange(N), dtype="Int64"), "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string_arrow": string_arrow, + "string[pyarrow]": string_arrow, }[dtype] if not unique: data = data.repeat(5) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 44245295beafc5..4b589816940142 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -25,8 +25,8 @@ class IsIn: "category[object]", "category[int]", "str", - "string", - "arrow_string", + "string[python]", + "string[pyarrow]", ] param_names = ["dtype"] @@ -62,9 +62,7 @@ def setup(self, dtype): self.values = np.random.choice(arr, sample_size) self.series = Series(arr).astype("category") - elif dtype in ["str", "string", "arrow_string"]: - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - + elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: self.series = Series(tm.makeStringIndex(N), dtype=dtype) except ImportError: diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2e109e59c1c6d5..32fbf4e6c7de3d 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -12,12 +12,10 @@ class Dtypes: - params = ["str", "string", "arrow_string"] + params = ["str", "string[python]", "string[pyarrow]"] param_names = ["dtype"] def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - try: self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) except ImportError: diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 43e25094694881..c6fda85b0486d7 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -480,6 +480,7 @@ we recommend using :class:`StringDtype` (with the alias ``"string"``). :template: autosummary/class_without_autosummary.rst arrays.StringArray + arrays.ArrowStringArray .. autosummary:: :toctree: api/ diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8b413808503ade..287ec69f65cd6d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -171,6 +171,58 @@ a copy will no longer be made (:issue:`32960`) The default behavior when not passing ``copy`` will remain unchanged, i.e. a copy will be made. +.. _whatsnew_130.arrow_string: + +PyArrow backed string data type +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've enhanced the :class:`StringDtype`, an extension type dedicated to string data. +(:issue:`39908`) + +It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`. Use +pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the +StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects. + +The PyArrow backed StringArray requires pyarrow 1.0.0 or greater to be installed. + +.. warning:: + + ``string[pyarrow]`` is currently considered experimental. The implementation + and parts of the API may change without warning. + +.. ipython:: python + + pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow")) + +You can use the alias ``"string[pyarrow]"`` as well. + +.. ipython:: python + + s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]") + s + +You can also create a PyArrow backed string array using pandas options. + +.. ipython:: python + + with pd.option_context("string_storage", "pyarrow"): + s = pd.Series(['abc', None, 'def'], dtype="string") + s + +The usual string accessor methods work. Where appropriate, the return type of the Series +or columns of a DataFrame will also have string dtype. + +.. ipython:: python + + s.str.upper() + s.str.split('b', expand=True).dtypes + +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + Centered Datetime-Like Rolling Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 96d010b487a790..1942e07d1b562a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -48,6 +48,7 @@ TimedeltaArray, ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.arrays.string_ import StringDtype from pandas.io.formats.printing import pprint_thing @@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None) if isinstance(left, np.ndarray): left = pprint_thing(left) - elif is_categorical_dtype(left) or isinstance(left, PandasDtype): + elif ( + is_categorical_dtype(left) + or isinstance(left, PandasDtype) + or isinstance(left, StringDtype) + ): left = repr(left) if isinstance(right, np.ndarray): right = pprint_thing(right) - elif is_categorical_dtype(right) or isinstance(right, PandasDtype): + elif ( + is_categorical_dtype(right) + or isinstance(right, PandasDtype) + or isinstance(right, StringDtype) + ): right = repr(right) msg += f""" diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 0fa070b6e4fc4e..89d362eb77e688 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. """ from pandas.core.arrays import ( + ArrowStringArray, BooleanArray, Categorical, DatetimeArray, @@ -18,6 +19,7 @@ ) __all__ = [ + "ArrowStringArray", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/conftest.py b/pandas/conftest.py index 329023ed7ba6a5..e106f7f425fa09 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1120,9 +1120,9 @@ def string_dtype(request): @pytest.fixture( params=[ - "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -1130,14 +1130,32 @@ def nullable_string_dtype(request): """ Parametrized fixture for string dtypes. - * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' + """ + return request.param + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), + ] +) +def string_storage(request): """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + Parametrized fixture for pd.options.mode.string_storage. + * 'python' + * 'pyarrow' + """ return request.param +# Alias so we can test with cartesian product of string_storage +string_storage2 = string_storage + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ @@ -1163,9 +1181,9 @@ def object_dtype(request): @pytest.fixture( params=[ "object", - "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -1173,11 +1191,9 @@ def any_string_dtype(request): """ Parametrized fixture for string dtypes. * 'object' - * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - return request.param diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 22f15ca9650db5..e301e82a0ee758 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,12 +17,14 @@ ) from pandas.core.arrays.sparse import SparseArray from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray __all__ = [ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", + "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ab1dadf4d2dfa0..8d150c8f6ad3d1 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np +from pandas._config import get_option + from pandas._libs import ( lib, missing as libmissing, @@ -14,6 +19,7 @@ Scalar, type_t, ) +from pandas.compat import pa_version_under1p0 from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ( @@ -37,6 +43,7 @@ IntegerArray, PandasArray, ) +from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -62,6 +69,11 @@ class StringDtype(ExtensionDtype): In particular, StringDtype.na_value may change to no longer be ``numpy.nan``. + Parameters + ---------- + storage : {"python", "pyarrow"}, optional + If not given, the value of ``pd.options.mode.string_storage``. + Attributes ---------- None @@ -73,20 +85,93 @@ class StringDtype(ExtensionDtype): Examples -------- >>> pd.StringDtype() - StringDtype + string[python] + + >>> pd.StringDtype(storage="pyarrow") + string[pyarrow] """ name = "string" #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + if storage == "pyarrow" and pa_version_under1p0: + raise ImportError( + "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + ) + + self.storage = storage @property def type(self) -> type[str]: return str @classmethod - def construct_array_type(cls) -> type_t[StringArray]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taking from `string`. + Valid options and their storage types are + + ========================== ============================================== + string result storage + ========================== ============================================== + ``'string'`` pd.options.mode.string_storage, default python + ``'string[python]'`` python + ``'string[pyarrow]'`` pyarrow + ========================== ============================================== + + Returns + ------- + StringDtype + + Raise + ----- + TypeError + If the string is not a valid option. + + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + return cls() + elif string == "string[python]": + return cls(storage="python") + elif string == "string[pyarrow]": + return cls(storage="pyarrow") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str) and other == "string": + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + # https://github.com/pandas-dev/pandas/issues/36126 + # error: Signature of "construct_array_type" incompatible with supertype + # "ExtensionDtype" + def construct_array_type( # type: ignore[override] + self, + ) -> type_t[BaseStringArray]: """ Return the array type associated with this dtype. @@ -94,30 +179,44 @@ def construct_array_type(cls) -> type_t[StringArray]: ------- type """ - return StringArray + from pandas.core.arrays.string_arrow import ArrowStringArray + + if self.storage == "python": + return StringArray + else: + return ArrowStringArray + + def __repr__(self): + return f"string[{self.storage}]" - def __repr__(self) -> str: - return "StringDtype" + def __str__(self): + return self.name def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> StringArray: + ) -> BaseStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. """ - import pyarrow + if self.storage == "pyarrow": + from pandas.core.arrays.string_arrow import ArrowStringArray - if isinstance(array, pyarrow.Array): - chunks = [array] + return ArrowStringArray(array) else: - # pyarrow.ChunkedArray - chunks = array.chunks - results = [] - for arr in chunks: - # using _from_sequence to ensure None is converted to NA - str_arr = StringArray._from_sequence(np.array(arr)) - results.append(str_arr) + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # using _from_sequence to ensure None is converted to NA + str_arr = StringArray._from_sequence(np.array(arr)) + results.append(str_arr) if results: return StringArray._concat_same_type(results) @@ -125,7 +224,11 @@ def __from_arrow__( return StringArray(np.array([], dtype="object")) -class StringArray(PandasArray): +class BaseStringArray(ExtensionArray): + pass + + +class StringArray(BaseStringArray, PandasArray): """ Extension array for string data. @@ -210,7 +313,7 @@ def __init__(self, values, copy=False): super().__init__(values, copy=copy) # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") - NDArrayBacked.__init__(self, self._ndarray, StringDtype()) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) if not isinstance(values, type(self)): self._validate() @@ -226,8 +329,9 @@ def _validate(self): @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): - if dtype: - assert dtype == "string" + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" from pandas.core.arrays.masked import BaseMaskedArray @@ -247,7 +351,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype()) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) return new_string_array @@ -416,7 +520,7 @@ def _str_map( from pandas.arrays import BooleanArray if dtype is None: - dtype = StringDtype() + dtype = StringDtype(storage="python") if na_value is None: na_value = self.dtype.na_value diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3cf471e381da90..ab8599f0f05ba1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -11,16 +11,12 @@ import numpy as np -from pandas._libs import ( - lib, - missing as libmissing, -) +from pandas._libs import lib from pandas._typing import ( Dtype, NpDtype, PositionalIndexer, Scalar, - type_t, ) from pandas.compat import ( pa_version_under1p0, @@ -43,7 +39,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna from pandas.core import missing @@ -52,7 +47,10 @@ from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype -from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_ import ( + BaseStringArray, + StringDtype, +) from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -86,99 +84,12 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) -@register_extension_dtype -class ArrowStringDtype(StringDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.2.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> from pandas.core.arrays.string_arrow import ArrowStringDtype - >>> ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - def __init__(self): - _chk_pyarrow_available() - - @property - def type(self) -> type[str]: - return str - - @classmethod - def construct_array_type(cls) -> type_t[ArrowStringArray]: # type: ignore[override] - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( # type: ignore[override] - self, array: pa.Array | pa.ChunkedArray - ) -> ArrowStringArray: - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): +class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -216,14 +127,14 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") ['This is', 'some text', , 'data.'] - Length: 4, dtype: arrow_string + Length: 4, dtype: string """ def __init__(self, values): - self._dtype = ArrowStringDtype() + self._dtype = StringDtype(storage="pyarrow") if isinstance(values, pa.Array): self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -242,6 +153,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) _chk_pyarrow_available() + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and # numerical issues with Float32Dtype @@ -261,9 +176,9 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @property - def dtype(self) -> ArrowStringDtype: + def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. + An instance of 'string[pyarrow]'. """ return self._dtype @@ -761,7 +676,8 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------------ # String methods interface - _str_na_value = ArrowStringDtype.na_value + # error: Cannot determine type of 'na_value' + _str_na_value = StringDtype.na_value # type: ignore[has-type] def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0215deb6b664ec..992dbae21f0fd1 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -535,6 +535,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 330902b402324e..379495eaa48d5c 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -109,18 +109,22 @@ def array( Currently, pandas will infer an extension dtype for sequences of - ============================== ===================================== + ============================== ======================================= Scalar Type Array Type - ============================== ===================================== + ============================== ======================================= :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`float` :class:`pandas.arrays.FloatingArray` - :class:`str` :class:`pandas.arrays.StringArray` + :class:`str` :class:`pandas.arrays.StringArray` or + :class:`pandas.arrays.ArrowStringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` - ============================== ===================================== + ============================== ======================================= + + The ExtensionArray created when the scalar type is :class:`str` is determined by + ``pd.options.mode.string_storage`` if the dtype is not explicitly given. For all other cases, NumPy's usual inference rules will be used. @@ -236,6 +240,14 @@ def array( ['a', , 'c'] Length: 3, dtype: string + >>> with pd.option_context("string_storage", "pyarrow"): + ... arr = pd.array(["a", None, "c"]) + ... + >>> arr + + ['a', , 'c'] + Length: 3, dtype: string + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) ['2000-01-01', '2000-01-01'] @@ -289,9 +301,9 @@ def array( IntervalArray, PandasArray, PeriodArray, - StringArray, TimedeltaArray, ) + from pandas.core.arrays.string_ import StringDtype if lib.is_scalar(data): msg = f"Cannot pass scalar '{data}' to 'pandas.array'." @@ -332,7 +344,8 @@ def array( return TimedeltaArray._from_sequence(data, copy=copy) elif inferred_dtype == "string": - return StringArray._from_sequence(data, copy=copy) + # StringArray/ArrowStringArray depending on pd.options.mode.string_storage + return StringDtype().construct_array_type()._from_sequence(data, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4abb5d98202f6f..f432cd27d1269c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -419,18 +419,14 @@ def maybe_cast_to_extension_array( ------- ExtensionArray or obj """ - from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_ import BaseStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg # Everything can be converted to StringArrays, but we may not want to convert - if ( - issubclass(cls, (StringArray, ArrowStringArray)) - and lib.infer_dtype(obj) != "string" - ): + if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string": return obj try: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7643019ff8c555..aa867ae4dd4014 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3080,7 +3080,7 @@ def _result_dtype(arr): from pandas.core.arrays.string_ import StringDtype if isinstance(arr.dtype, StringDtype): - return arr.dtype.name + return arr.dtype else: return object diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 7ce4abe904f3bb..02bdb7f1815839 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -173,8 +173,7 @@ def scalar_rep(x): return self._str_map(scalar_rep, dtype=str) else: - from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_ import BaseStringArray def rep(x, r): if x is libmissing.NA: @@ -186,7 +185,7 @@ def rep(x, r): repeats = np.asarray(repeats, dtype=object) result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, (StringArray, ArrowStringArray)): + if isinstance(self, BaseStringArray): # Not going through map, so we have to do this here. result = type(self)._from_sequence(result) return result diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c9533e239abe03..5731f02430a9da 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -14,37 +14,17 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, -) - -skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") - - -@pytest.fixture( - params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] -) -def dtype(request): - return request.param +from pandas.core.arrays.string_arrow import ArrowStringArray @pytest.fixture -def dtype_object(dtype): - if dtype == "string": - return pd.StringDtype - else: - return ArrowStringDtype +def dtype(string_storage): + return pd.StringDtype(storage=string_storage) -@pytest.fixture( - params=[ - pd.arrays.StringArray, - pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), - ] -) -def cls(request): - return request.param +@pytest.fixture +def cls(dtype): + return dtype.construct_array_type() def test_repr(dtype): @@ -52,11 +32,11 @@ def test_repr(dtype): expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = f"0 a\n1 \n2 b\nName: A, dtype: {dtype}" + expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype == "arrow_string" else "StringArray" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: {dtype}" + arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -94,7 +74,7 @@ def test_setitem_with_scalar_string(dtype): def test_astype_roundtrip(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "ValueError: Could not convert object to NumPy datetime" mark = pytest.mark.xfail(reason=reason, raises=ValueError) request.node.add_marker(mark) @@ -115,7 +95,7 @@ def test_astype_roundtrip(dtype, request): def test_add(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "unsupported operand type(s) for +: 'ArrowStringArray' and " "'ArrowStringArray'" @@ -143,7 +123,7 @@ def test_add(dtype, request): def test_add_2d(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) @@ -159,7 +139,7 @@ def test_add_2d(dtype, request): def test_add_sequence(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "unsupported operand type(s) for +: 'ArrowStringArray' and 'list'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) @@ -177,7 +157,7 @@ def test_add_sequence(dtype, request): def test_mul(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) @@ -258,7 +238,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ def test_comparison_methods_array(all_compare_operators, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": mark = pytest.mark.xfail( raises=AssertionError, reason="left is not an ExtensionArray" ) @@ -366,7 +346,7 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "'ArrowStringArray' object has no attribute 'max'" mark = pytest.mark.xfail(raises=AttributeError, reason=reason) request.node.add_marker(mark) @@ -383,7 +363,7 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": if box is pd.array: raises = TypeError reason = "'<=' not supported between instances of 'str' and 'NoneType'" @@ -413,7 +393,7 @@ def test_reduce_missing(skipna, dtype): def test_fillna_args(dtype, request): # GH 37987 - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "Regex pattern \"Cannot set non-string value '1' into " "a StringArray.\" does not match 'Scalar must be NA or str'" @@ -444,14 +424,14 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype == "arrow_string": + if dtype.storage == "pyarrow": expected = pa.chunked_array(expected) assert arr.equals(expected) @td.skip_if_no("pyarrow") -def test_arrow_roundtrip(dtype, dtype_object): +def test_arrow_roundtrip(dtype, string_storage2): # roundtrip possible from arrow 1.0.0 import pyarrow as pa @@ -459,15 +439,17 @@ def test_arrow_roundtrip(dtype, dtype_object): df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" - result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) - tm.assert_frame_equal(result, df) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA @td.skip_if_no("pyarrow") -def test_arrow_load_from_zero_chunks(dtype, dtype_object): +def test_arrow_load_from_zero_chunks(dtype, string_storage2): # GH-41040 import pyarrow as pa @@ -477,9 +459,11 @@ def test_arrow_load_from_zero_chunks(dtype, dtype_object): assert table.field("a").type == "string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) - tm.assert_frame_equal(result, df) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): @@ -523,10 +507,10 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype, request): +def test_memory_usage(dtype): # GH 33963 - if dtype == "arrow_string": + if dtype.storage == "pyarrow": pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 3db8333798e367..c3f951adf7f893 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -5,16 +5,47 @@ from pandas.compat import pa_version_under1p0 -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, ) +from pandas.core.arrays.string_arrow import ArrowStringArray - -@pytest.mark.skipif( +skip_if_no_pyarrow = pytest.mark.skipif( pa_version_under1p0, reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray", ) + + +@skip_if_no_pyarrow +def test_eq_all_na(): + a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_config(string_storage): + with pd.option_context("string_storage", string_storage): + assert StringDtype().storage == string_storage + result = pd.array(["a", "b"]) + assert result.dtype.storage == string_storage + + expected = ( + StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"]) + ) + tm.assert_equal(result, expected) + + +def test_config_bad_storage_raises(): + msg = re.escape("Value must be one of python|pyarrow") + with pytest.raises(ValueError, match=msg): + pd.options.mode.string_storage = "foo" + + +@skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) def test_constructor_not_string_type_raises(array, chunked): @@ -37,6 +68,55 @@ def test_constructor_not_string_type_raises(array, chunked): ArrowStringArray(arr) +@skip_if_no_pyarrow +def test_from_sequence_wrong_dtype_raises(): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + StringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + @pytest.mark.skipif( not pa_version_under1p0, reason="pyarrow is installed", @@ -45,7 +125,7 @@ def test_pyarrow_not_installed_raises(): msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray") with pytest.raises(ImportError, match=msg): - ArrowStringDtype() + StringDtype(storage="pyarrow") with pytest.raises(ImportError, match=msg): ArrowStringArray([]) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index bfe588883d9f36..61d56df485ab18 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -18,7 +18,6 @@ IntegerArray, IntervalArray, SparseArray, - StringArray, TimedeltaArray, ) from pandas.core.arrays import ( @@ -132,8 +131,16 @@ ([1, None], "Int16", pd.array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", StringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), + ( + ["a", None], + "string", + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), + ( + ["a", None], + pd.StringDtype(), + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), @@ -253,8 +260,14 @@ def test_array_copy(): ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), # string - (["a", "b"], StringArray._from_sequence(["a", "b"])), - (["a", None], StringArray._from_sequence(["a", None])), + ( + ["a", "b"], + pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]), + ), + ( + ["a", None], + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False])), ([True, None], BooleanArray._from_sequence([True, None])), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index c6f8efe7b939ee..0bd10b36a8b5c4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -298,7 +298,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, request): + def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage): if isinstance(arr1d, DatetimeArray): tz = arr1d.tz ts1, ts2 = arr1d[1:3] @@ -341,14 +341,17 @@ def test_searchsorted_castable_strings(self, arr1d, box, request): ): arr.searchsorted("foo") - with pytest.raises( - TypeError, - match=re.escape( - f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got 'StringArray' instead." - ), - ): - arr.searchsorted([str(arr[1]), "baz"]) + arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + + with pd.option_context("string_storage", string_storage): + with pytest.raises( + TypeError, + match=re.escape( + f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{arr_type}' instead." + ), + ): + arr.searchsorted([str(arr[1]), "baz"]) def test_getitem_near_implementation_bounds(self): # We only check tz-naive for DTA bc the bounds are slightly different diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index 23a07b2031bf59..67a62978aa1bc8 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -2,12 +2,11 @@ import pandas as pd -pytest.importorskip("pyarrow", minversion="0.13.0") - -from pandas.tests.extension.arrow.arrays import ArrowStringDtype # isort:skip +pytest.importorskip("pyarrow", minversion="1.0.0") def test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=ArrowStringDtype()) - assert isinstance(result.dtype, ArrowStringDtype) + result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, pd.StringDtype) + assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 99a5666926e107..9c59c79f677deb 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -48,16 +48,14 @@ def test_astype_str(self, data): @pytest.mark.parametrize( "nullable_string_dtype", [ - "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ], ) def test_astype_string(self, data, nullable_string_dtype): # GH-33465 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - result = pd.Series(data[:5]).astype(nullable_string_dtype) expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49aee76e10f6a2..3d0edb70d1ced5 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,16 +18,13 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas.core.arrays.string_ import StringDtype -from pandas.core.arrays.string_arrow import ArrowStringDtype from pandas.tests.extension import base def split_array(arr): - if not isinstance(arr.dtype, ArrowStringDtype): + if arr.dtype.storage != "pyarrow": pytest.skip("chunked array n/a") def _split_array(arr): @@ -49,16 +46,9 @@ def chunked(request): return request.param -@pytest.fixture( - params=[ - StringDtype, - pytest.param( - ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ] -) -def dtype(request): - return request.param() +@pytest.fixture +def dtype(string_storage): + return StringDtype(storage=string_storage) @pytest.fixture @@ -104,24 +94,28 @@ def data_for_grouping(dtype, chunked): class TestDtype(base.BaseDtypeTests): - pass + def test_eq_with_str(self, dtype): + assert dtype == f"string[{dtype.storage}]" + super().test_eq_with_str(dtype) class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): - if isinstance(data.dtype, ArrowStringDtype): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_view(data) class TestConstructors(base.BaseConstructorsTests): - pass + def test_from_dtype(self, data): + # base test uses string representation of dtype + pass class TestReshaping(base.BaseReshapingTests): - def test_transpose(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_transpose(self, data, request): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_transpose(data) @@ -132,8 +126,8 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - def test_setitem_preserves_views(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_setitem_preserves_views(self, data, request): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_setitem_preserves_views(data) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1583b3f91bea2c..881f8db3052403 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -584,10 +584,10 @@ def test_astype_empty_dtype_dict(self): @pytest.mark.parametrize( "data, dtype", [ - (["x", "y", "z"], "string"), + (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), (["x", "y", "z"], "category"), @@ -598,8 +598,6 @@ def test_astype_empty_dtype_dict(self): @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): # https://github.com/pandas-dev/pandas/issues/35471 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - df = DataFrame(Series(data, dtype=dtype)) if errors == "ignore": expected = df diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index dd7bf0aada4496..a2d539d784d3c6 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -9,7 +9,7 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -18,11 +18,12 @@ def test_convert_dtypes(self, convert_integer, expected): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.convert_dtypes(True, True, convert_integer, False) + with pd.option_context("string_storage", string_storage): + result = df.convert_dtypes(True, True, convert_integer, False) expected = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=expected), - "b": pd.Series(["x", "y", "z"], dtype="string"), + "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"), } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ae6425cd93ac54..d100c584b698a1 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -810,12 +810,11 @@ def test_additional_extension_arrays(self, pa): check_round_trip(df, pa) @td.skip_if_no("pyarrow", min_version="1.0.0") - def test_pyarrow_backed_string_array(self, pa): + def test_pyarrow_backed_string_array(self, pa, string_storage): # test ArrowStringArray supported through the __arrow_array__ protocol - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="arrow_string")}) - check_round_trip(df, pa, expected=df) + df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) + with pd.option_context("string_storage", string_storage): + check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) @td.skip_if_no("pyarrow") def test_additional_extension_types(self, pa): diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ffaecf15763646..99a7ba910eb74c 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -249,10 +249,10 @@ def test_td64_series_astype_object(self): @pytest.mark.parametrize( "data, dtype", [ - (["x", "y", "z"], "string"), + (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), (["x", "y", "z"], "category"), @@ -263,9 +263,6 @@ def test_td64_series_astype_object(self): @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): # https://github.com/pandas-dev/pandas/issues/35471 - - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - ser = Series(data, dtype=dtype) if errors == "ignore": expected = ser diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 9a64877cb92ff0..d9d6641d54237d 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -11,7 +11,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 class TestUpdate: @@ -87,12 +86,12 @@ def test_update_from_non_series(self, series, other, expected): @pytest.mark.parametrize( "data, other, expected, dtype", [ - (["a", None], [None, "b"], ["a", "b"], "string"), + (["a", None], [None, "b"], ["a", "b"], "string[python]"), pytest.param( ["a", None], [None, "b"], ["a", "b"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), ([1, None], [None, 2], [1, 2], "Int64"), diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index ec8b5bfa11ad51..6cbf2dd606692e 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -6,6 +6,7 @@ MultiIndex, Series, _testing as tm, + get_option, ) from pandas.core import strings as strings @@ -128,7 +129,9 @@ def test_api_per_method( def test_api_for_categorical(any_string_method, any_string_dtype, request): # https://github.com/pandas-dev/pandas/issues/10661 - if any_string_dtype == "arrow_string": + if any_string_dtype == "string[pyarrow]" or ( + any_string_dtype == "string" and get_option("string_storage") == "pyarrow" + ): # unsupported operand type(s) for +: 'ArrowStringArray' and 'str' mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented") request.node.add_marker(mark)