Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enh: adding automated inferencing of format %Y-%m-%dT%H:%M in pyarrow #1292

Merged
merged 5 commits into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,9 @@ def convert_str_slice_to_int_slice(
# Regex building blocks for the date, separator, time and timezone components
# of a datetime string. Each pattern captures its component in a named group.

# Date: three digit runs (1-4, 1-2 and 1-4 digits), each pair joined by one of
# '-', '/' or '.'.
DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})"
# Separator between date and time: a single whitespace character or 'T'.
SEP_RE = r"(?P<sep>\s|T)"
# NOTE(review): TIME_RE is assigned twice in a row. The second assignment, which
# makes the ':SS' seconds part optional, is the value that remains in effect.
TIME_RE = r"(?P<time>\d{2}:\d{2}:\d{2})" # \s*(?P<period>[AP]M)?)?
TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?)" # \s*(?P<period>[AP]M)?)?
# Time with seconds, 'HH:MM:SS' (exactly two digits per field).
HMS_RE = r"(?P<hms>\d{2}:\d{2}:\d{2})"
# Time without seconds, 'HH:MM'. The pattern is unanchored, so it also matches
# the leading 'HH:MM' of an 'HH:MM:SS' value.
HM_RE = r"(?P<hm>\d{2}:\d{2})"
raisadz marked this conversation as resolved.
Show resolved Hide resolved
# Timezone: 'Z', or a signed offset made of two digits, an optional colon and
# two more digits.
TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})" # Matches 'Z', '+02:00', '+0200'; a bare '+02' does not match.
# Full pattern: a date followed by an optional separator, an optional time and
# an optional timezone, anchored at the end of the string by '$'.
FULL_RE = rf"{DATE_RE}{SEP_RE}?{TIME_RE}?{TZ_RE}?$"

Expand Down Expand Up @@ -418,5 +420,12 @@ def _parse_date_format(arr: pa.Array) -> str:
def _parse_time_format(arr: pa.Array) -> str:
    """Infer the strptime time format shared by every element of ``arr``.

    Candidate patterns are tried from most to least specific, and the format
    paired with the first pattern that matches every element is returned.

    Arguments:
        arr: pyarrow array of strings to inspect.

    Returns:
        ``"%H:%M:%S"`` or ``"%H:%M"``, or an empty string when no candidate
        pattern matches every element.
    """
    import pyarrow.compute as pc  # ignore-banned-import

    # Order matters: HM_RE is unanchored and also matches the start of an
    # "HH:MM:SS" value, so the pattern with seconds has to be tried first.
    time_formats = (
        (HMS_RE, "%H:%M:%S"),
        (HM_RE, "%H:%M"),
    )
    for pattern, fmt in time_formats:
        matches = pc.extract_regex(arr, pattern=pattern)
        # extract_regex yields null for elements that do not match, so the
        # format is accepted only when every element produced a valid match.
        if pc.all(matches.is_valid()).as_py():
            return fmt
    return ""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My opinionated take is to follow a similar pattern (pun unintended) to the date regex, i.e. having a format mapping:

TIME_FORMATS = (
    (HMS_RE, "%H:%M:%S"),
    (HM_RE, "%H:%M"),
)

and here loop through that mapping and return early with the first format that fully matches; otherwise return an empty string

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the suggestion @FBruzzesi! I added a time formatting mapping similar to how it is done for dates, it looks much nicer now 🎉

52 changes: 44 additions & 8 deletions tests/expr_and_series/str/to_datetime_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,29 @@ def test_to_datetime_series(constructor_eager: ConstructorEager) -> None:
assert str(result) == expected


def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
@pytest.mark.parametrize(
("data", "expected", "expected_cudf"),
[
(
{"a": ["2020-01-01T12:34:56"]},
"2020-01-01 12:34:56",
"2020-01-01T12:34:56.000000000",
),
(
{"a": ["2020-01-01T12:34"]},
"2020-01-01 12:34:00",
"2020-01-01T12:34:00.000000000",
),
],
)
def test_to_datetime_infer_fmt(
constructor: Constructor,
data: dict[str, list[str]],
expected: str,
expected_cudf: str,
) -> None:
if "cudf" in str(constructor): # pragma: no cover
expected = "2020-01-01T12:34:56.000000000"
else:
expected = "2020-01-01 12:34:56"
expected = expected_cudf

result = (
nw.from_native(constructor(data))
Expand All @@ -63,11 +81,29 @@ def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
assert str(result) == expected


def test_to_datetime_series_infer_fmt(constructor_eager: ConstructorEager) -> None:
@pytest.mark.parametrize(
("data", "expected", "expected_cudf"),
[
(
{"a": ["2020-01-01T12:34:56"]},
"2020-01-01 12:34:56",
"2020-01-01T12:34:56.000000000",
),
(
{"a": ["2020-01-01T12:34"]},
"2020-01-01 12:34:00",
"2020-01-01T12:34:00.000000000",
),
],
)
def test_to_datetime_series_infer_fmt(
constructor_eager: ConstructorEager,
data: dict[str, list[str]],
expected: str,
expected_cudf: str,
) -> None:
if "cudf" in str(constructor_eager): # pragma: no cover
expected = "2020-01-01T12:34:56.000000000"
else:
expected = "2020-01-01 12:34:56"
expected = expected_cudf

result = (
nw.from_native(constructor_eager(data), eager_only=True)["a"].str.to_datetime()
Expand Down
Loading