Skip to content

Commit

Permalink
[ArrowStringArray] API: StringDtype parameterized by storage (python …
Browse files Browse the repository at this point in the history
…or pyarrow) (pandas-dev#39908)

Co-authored-by: Uwe L. Korn <[email protected]>
Co-authored-by: Tom Augspurger <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
4 people authored and JulianWgs committed Jul 3, 2021
1 parent 64a72c3 commit 2107dad
Show file tree
Hide file tree
Showing 29 changed files with 478 additions and 294 deletions.
23 changes: 10 additions & 13 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,41 +23,38 @@ class Factorize:
"int",
"uint",
"float",
"string",
"object",
"datetime64[ns]",
"datetime64[ns, tz]",
"Int64",
"boolean",
"string_arrow",
"string[pyarrow]",
],
]
param_names = ["unique", "sort", "dtype"]

def setup(self, unique, sort, dtype):
N = 10 ** 5
string_index = tm.makeStringIndex(N)
try:
from pandas.core.arrays.string_arrow import ArrowStringDtype

string_arrow = pd.array(string_index, dtype=ArrowStringDtype())
except ImportError:
string_arrow = None

if dtype == "string_arrow" and not string_arrow:
raise NotImplementedError
string_arrow = None
if dtype == "string[pyarrow]":
try:
string_arrow = pd.array(string_index, dtype="string[pyarrow]")
except ImportError:
raise NotImplementedError

data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.random.randn(N)),
"string": string_index,
"object": string_index,
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"Int64": pd.array(np.arange(N), dtype="Int64"),
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
"string_arrow": string_arrow,
"string[pyarrow]": string_arrow,
}[dtype]
if not unique:
data = data.repeat(5)
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ class IsIn:
"category[object]",
"category[int]",
"str",
"string",
"arrow_string",
"string[python]",
"string[pyarrow]",
]
param_names = ["dtype"]

Expand Down Expand Up @@ -62,9 +62,7 @@ def setup(self, dtype):
self.values = np.random.choice(arr, sample_size)
self.series = Series(arr).astype("category")

elif dtype in ["str", "string", "arrow_string"]:
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401

elif dtype in ["str", "string[python]", "string[pyarrow]"]:
try:
self.series = Series(tm.makeStringIndex(N), dtype=dtype)
except ImportError:
Expand Down
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@


class Dtypes:
params = ["str", "string", "arrow_string"]
params = ["str", "string[python]", "string[pyarrow]"]
param_names = ["dtype"]

def setup(self, dtype):
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401

try:
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
except ImportError:
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ we recommend using :class:`StringDtype` (with the alias ``"string"``).
:template: autosummary/class_without_autosummary.rst

arrays.StringArray
arrays.ArrowStringArray

.. autosummary::
:toctree: api/
Expand Down
52 changes: 52 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,58 @@ a copy will no longer be made (:issue:`32960`)
The default behavior when not passing ``copy`` will remain unchanged, i.e.
a copy will be made.

.. _whatsnew_130.arrow_string:

PyArrow backed string data type
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

We've enhanced the :class:`StringDtype`, an extension type dedicated to string data.
(:issue:`39908`)

It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`. Use
pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the
StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects.

The PyArrow backed StringArray requires pyarrow 1.0.0 or greater to be installed.

.. warning::

   ``string[pyarrow]`` is currently considered experimental. The implementation
   and parts of the API may change without warning.

.. ipython:: python

    pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow"))

You can use the alias ``"string[pyarrow]"`` as well.

.. ipython:: python

    s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]")
    s

You can also create a PyArrow backed string array using pandas options.

.. ipython:: python

    with pd.option_context("string_storage", "pyarrow"):
        s = pd.Series(['abc', None, 'def'], dtype="string")
    s

The usual string accessor methods work. Where appropriate, the return type of the Series
or columns of a DataFrame will also have string dtype.

.. ipython:: python

    s.str.upper()
    s.str.split('b', expand=True).dtypes

String accessor methods returning integers will return a value with :class:`Int64Dtype`.

.. ipython:: python

    s.str.count("a")

Centered Datetime-Like Rolling Windows
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
13 changes: 11 additions & 2 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
TimedeltaArray,
)
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
from pandas.core.arrays.string_ import StringDtype

from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None)

if isinstance(left, np.ndarray):
left = pprint_thing(left)
elif is_categorical_dtype(left) or isinstance(left, PandasDtype):
elif (
is_categorical_dtype(left)
or isinstance(left, PandasDtype)
or isinstance(left, StringDtype)
):
left = repr(left)

if isinstance(right, np.ndarray):
right = pprint_thing(right)
elif is_categorical_dtype(right) or isinstance(right, PandasDtype):
elif (
is_categorical_dtype(right)
or isinstance(right, PandasDtype)
or isinstance(right, StringDtype)
):
right = repr(right)

msg += f"""
Expand Down
2 changes: 2 additions & 0 deletions pandas/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
See :ref:`extending.extension-types` for more.
"""
from pandas.core.arrays import (
ArrowStringArray,
BooleanArray,
Categorical,
DatetimeArray,
Expand All @@ -18,6 +19,7 @@
)

__all__ = [
"ArrowStringArray",
"BooleanArray",
"Categorical",
"DatetimeArray",
Expand Down
38 changes: 27 additions & 11 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,24 +1120,42 @@ def string_dtype(request):

@pytest.fixture(
params=[
"string",
"string[python]",
pytest.param(
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
"string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
),
]
)
def nullable_string_dtype(request):
"""
Parametrized fixture for string dtypes.
* 'string'
* 'arrow_string'
* 'string[python]'
* 'string[pyarrow]'
"""
return request.param


@pytest.fixture(
params=[
"python",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
]
)
def string_storage(request):
"""
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
Parametrized fixture for pd.options.mode.string_storage.
* 'python'
* 'pyarrow'
"""
return request.param


# Alias so we can test with cartesian product of string_storage
string_storage2 = string_storage


@pytest.fixture(params=tm.BYTES_DTYPES)
def bytes_dtype(request):
"""
Expand All @@ -1163,21 +1181,19 @@ def object_dtype(request):
@pytest.fixture(
params=[
"object",
"string",
"string[python]",
pytest.param(
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
"string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
),
]
)
def any_string_dtype(request):
"""
Parametrized fixture for string dtypes.
* 'object'
* 'string'
* 'arrow_string'
* 'string[python]'
* 'string[pyarrow]'
"""
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401

return request.param


Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
)
from pandas.core.arrays.sparse import SparseArray
from pandas.core.arrays.string_ import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArray
from pandas.core.arrays.timedeltas import TimedeltaArray

__all__ = [
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"ArrowStringArray",
"BaseMaskedArray",
"BooleanArray",
"Categorical",
Expand Down
Loading

0 comments on commit 2107dad

Please sign in to comment.