Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

str accessor inconsistencies #4334 #4339

Merged
merged 9 commits into from
Aug 15, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Bug fixes
~~~~~~~~~
- Fixed a bug in backend caused by basic installation of Dask (:issue:`4164`, :pull:`4318`)
`Sam Morley <https://github.com/inakleinbottle>`_.
- Fixed inconsistencies between docstring and functionality for :py:meth:`DataArray.str.get`
and :py:meth:`DataArray.str.wrap` (:issue:`4334`). By `Mathias Hauser <https://github.com/mathause>`_.


Documentation
Expand Down
72 changes: 29 additions & 43 deletions xarray/core/accessor_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@

import codecs
import re
import textwrap

import numpy as np

Expand Down Expand Up @@ -90,7 +89,7 @@ def _apply(self, f, dtype=None):

def len(self):
"""
Compute the length of each element in the array.
Compute the length of each string in the array.

Returns
-------
Expand All @@ -104,9 +103,9 @@ def __getitem__(self, key):
else:
return self.get(key)

def get(self, i):
def get(self, i, default=""):
"""
Extract element from indexable in each element in the array.
Extract character number `i` from each string in the array.

Parameters
----------
Expand All @@ -120,12 +119,18 @@ def get(self, i):
-------
items : array of objects
"""
obj = slice(-1, None) if i == -1 else slice(i, i + 1)
return self._apply(lambda x: x[obj])
s = slice(-1, None) if i == -1 else slice(i, i + 1)

def f(x):
item = x[s]

return item if item else default

return self._apply(f)

def slice(self, start=None, stop=None, step=None):
"""
Slice substrings from each element in the array.
Slice substrings from each string in the array.

Parameters
----------
Expand Down Expand Up @@ -359,7 +364,7 @@ def count(self, pat, flags=0):

def startswith(self, pat):
"""
Test if the start of each string element matches a pattern.
Test if the start of each string in the array matches a pattern.

Parameters
----------
Expand All @@ -378,7 +383,7 @@ def startswith(self, pat):

def endswith(self, pat):
"""
Test if the end of each string element matches a pattern.
Test if the end of each string in the array matches a pattern.

Parameters
----------
Expand Down Expand Up @@ -432,8 +437,7 @@ def pad(self, width, side="left", fillchar=" "):

def center(self, width, fillchar=" "):
"""
Filling left and right side of strings in the array with an
additional character.
Pad left and right side of each string in the array.

Parameters
----------
Expand All @@ -451,8 +455,7 @@ def center(self, width, fillchar=" "):

def ljust(self, width, fillchar=" "):
"""
Filling right side of strings in the array with an additional
character.
Pad right side of each string in the array.

Parameters
----------
Expand All @@ -470,7 +473,7 @@ def ljust(self, width, fillchar=" "):

def rjust(self, width, fillchar=" "):
"""
Filling left side of strings in the array with an additional character.
Pad left side of each string in the array.

Parameters
----------
Expand All @@ -488,7 +491,7 @@ def rjust(self, width, fillchar=" "):

def zfill(self, width):
"""
Pad strings in the array by prepending '0' characters.
Pad each string in the array by prepending '0' characters.

Strings in the array are padded with '0' characters on the
left of the string to reach a total string length `width`. Strings
Expand All @@ -508,7 +511,7 @@ def zfill(self, width):

def contains(self, pat, case=True, flags=0, regex=True):
"""
Test if pattern or regex is contained within a string of the array.
Test if pattern or regex is contained within each string of the array.

Return boolean array based on whether a given pattern or regex is
contained within a string of the array.
Expand Down Expand Up @@ -554,7 +557,7 @@ def contains(self, pat, case=True, flags=0, regex=True):

def match(self, pat, case=True, flags=0):
"""
Determine if each string matches a regular expression.
Determine if each string in the array matches a regular expression.

Parameters
----------
Expand Down Expand Up @@ -613,7 +616,7 @@ def strip(self, to_strip=None, side="both"):

def lstrip(self, to_strip=None):
"""
Remove leading and trailing characters.
Remove leading characters.

Strip whitespaces (including newlines) or a set of specified characters
from each string in the array from the left side.
Expand All @@ -633,7 +636,7 @@ def lstrip(self, to_strip=None):

def rstrip(self, to_strip=None):
"""
Remove leading and trailing characters.
Remove trailing characters.

Strip whitespaces (including newlines) or a set of specified characters
from each string in the array from the right side.
Expand All @@ -653,8 +656,7 @@ def rstrip(self, to_strip=None):

def wrap(self, width, **kwargs):
"""
Wrap long strings in the array to be formatted in paragraphs with
length less than a given width.
Wrap long strings in the array in paragraphs with length less than `width`.

This method has the same keyword parameters and defaults as
:class:`textwrap.TextWrapper`.
Expand All @@ -663,38 +665,22 @@ def wrap(self, width, **kwargs):
----------
width : int
Maximum line-width
expand_tabs : bool, optional
If true, tab characters will be expanded to spaces (default: True)
replace_whitespace : bool, optional
If true, each whitespace character (as defined by
string.whitespace) remaining after tab expansion will be replaced
by a single space (default: True)
drop_whitespace : bool, optional
If true, whitespace that, after wrapping, happens to end up at the
beginning or end of a line is dropped (default: True)
break_long_words : bool, optional
If true, then words longer than width will be broken in order to
ensure that no lines are longer than width. If it is false, long
words will not be broken, and some lines may be longer than width.
(default: True)
break_on_hyphens : bool, optional
If true, wrapping will occur preferably on whitespace and right
after hyphens in compound words, as it is customary in English. If
false, only whitespaces will be considered as potentially good
places for line breaks, but you need to set break_long_words to
false if you want truly insecable words. (default: True)
kwargs
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring was quite incomplete so I thought it best to just point to: https://docs.python.org/3/library/textwrap.html#textwrap.TextWrapper

keyword arguments passed into :class:`textwrap.TextWrapper`.

Returns
-------
wrapped : same type as values
"""
tw = textwrap.TextWrapper(width=width)
import textwrap

tw = textwrap.TextWrapper(width=width, **kwargs)
f = lambda x: "\n".join(tw.wrap(x))
return self._apply(f)

def translate(self, table):
"""
Map all characters in the string through the given mapping table.
Map characters of each string through the given mapping table.

Parameters
----------
Expand Down
35 changes: 29 additions & 6 deletions xarray/tests/test_accessor_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ def test_wrap():
)

# expected values
xp = xr.DataArray(
expected = xr.DataArray(
[
"hello world",
"hello world!",
Expand All @@ -610,15 +610,29 @@ def test_wrap():
]
)

rs = values.str.wrap(12, break_long_words=True)
assert_equal(rs, xp)
result = values.str.wrap(12, break_long_words=True)
assert_equal(result, expected)

# test with leading and trailing whitespace and non-ASCII Unicode
# characters
values = xr.DataArray([" pre ", "\xac\u20ac\U00008000 abadcafe"])
xp = xr.DataArray([" pre", "\xac\u20ac\U00008000 ab\nadcafe"])
rs = values.str.wrap(6)
assert_equal(rs, xp)
expected = xr.DataArray([" pre", "\xac\u20ac\U00008000 ab\nadcafe"])
result = values.str.wrap(6)
assert_equal(result, expected)


def test_wrap_kwargs_passed():
# GH4334

values = xr.DataArray(" hello world ")

result = values.str.wrap(7)
expected = xr.DataArray(" hello\nworld")
assert_equal(result, expected)

result = values.str.wrap(7, drop_whitespace=False)
expected = xr.DataArray(" hello\n world\n ")
assert_equal(result, expected)


def test_get(dtype):
Expand All @@ -642,6 +656,15 @@ def test_get(dtype):
assert_equal(result, expected)


def test_get_default(dtype):
# GH4334
values = xr.DataArray(["a_b", "c", ""]).astype(dtype)

result = values.str.get(2, "default")
expected = xr.DataArray(["b", "default", "default"]).astype(dtype)
assert_equal(result, expected)


def test_encode_decode():
data = xr.DataArray(["a", "b", "a\xe4"])
encoded = data.str.encode("utf-8")
Expand Down