-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Datetime parsing (PDEP-4): allow mixture of ISO formatted strings #50939
Changes from 16 commits
044948f
f4e1392
9f06d80
d7f6056
8952a0e
6e6d579
3d65dbf
b247bbd
2f66f87
eb36d8c
262be89
e01b6ee
4a61e6a
607c77d
531e0e8
5582882
57b922c
313003e
2ede506
3b61e5b
acd44ae
ba6393f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -186,6 +186,7 @@ def array_strptime( | |
bint iso_format = format_is_iso(fmt) | ||
NPY_DATETIMEUNIT out_bestunit | ||
int out_local = 0, out_tzoffset = 0 | ||
bint string_to_dts_succeeded = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any particular reason this is changed from failed? doesn't really matter to me, just curious There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it just simplifies the logic |
||
|
||
assert is_raise or is_ignore or is_coerce | ||
|
||
|
@@ -306,53 +307,62 @@ def array_strptime( | |
else: | ||
val = str(val) | ||
|
||
if iso_format: | ||
string_to_dts_failed = string_to_dts( | ||
if fmt == "ISO8601": | ||
string_to_dts_succeeded = not string_to_dts( | ||
val, &dts, &out_bestunit, &out_local, | ||
&out_tzoffset, False, None, False | ||
) | ||
elif iso_format: | ||
string_to_dts_succeeded = not string_to_dts( | ||
val, &dts, &out_bestunit, &out_local, | ||
&out_tzoffset, False, fmt, exact | ||
) | ||
if not string_to_dts_failed: | ||
# No error reported by string_to_dts, pick back up | ||
# where we left off | ||
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | ||
if out_local == 1: | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
tz = timezone(timedelta(minutes=out_tzoffset)) | ||
result_timezone[i] = tz | ||
out_local = 0 | ||
out_tzoffset = 0 | ||
iresult[i] = value | ||
check_dts_bounds(&dts) | ||
continue | ||
if string_to_dts_succeeded: | ||
# No error reported by string_to_dts, pick back up | ||
# where we left off | ||
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | ||
if out_local == 1: | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
tz = timezone(timedelta(minutes=out_tzoffset)) | ||
result_timezone[i] = tz | ||
out_local = 0 | ||
out_tzoffset = 0 | ||
iresult[i] = value | ||
check_dts_bounds(&dts) | ||
continue | ||
|
||
if parse_today_now(val, &iresult[i], utc): | ||
continue | ||
|
||
# Some ISO formats can't be parsed by string_to_dts | ||
# For example, 6-digit YYYYMD. So, if there's an error, | ||
# try the string-matching code below. | ||
# For example, 6-digit YYYYMD. So, if there's an error, and a format | ||
# was specified, then try the string-matching code below. If the format | ||
# specified was 'ISO8601', then we need to error, because | ||
# only string_to_dts handles mixed ISO8601 formats. | ||
if not string_to_dts_succeeded and fmt == "ISO8601": | ||
raise ValueError(f"Time data {val} is not ISO8601 format") | ||
|
||
# exact matching | ||
if exact: | ||
found = format_regex.match(val) | ||
if not found: | ||
raise ValueError(f"time data \"{val}\" doesn't " | ||
f"match format \"{fmt}\"") | ||
raise ValueError( | ||
f"time data \"{val}\" doesn't match format \"{fmt}\"" | ||
) | ||
if len(val) != found.end(): | ||
raise ValueError( | ||
f"unconverted data remains: " | ||
f'"{val[found.end():]}"' | ||
"unconverted data remains when parsing with " | ||
f"format \"{fmt}\": \"{val[found.end():]}\"" | ||
) | ||
|
||
# search | ||
else: | ||
found = format_regex.search(val) | ||
if not found: | ||
raise ValueError( | ||
f"time data \"{val}\" doesn't match " | ||
f"format \"{fmt}\"" | ||
f"time data \"{val}\" doesn't match format \"{fmt}\"" | ||
) | ||
|
||
iso_year = -1 | ||
|
@@ -504,7 +514,14 @@ def array_strptime( | |
result_timezone[i] = tz | ||
|
||
except (ValueError, OutOfBoundsDatetime) as ex: | ||
ex.args = (f"{str(ex)}, at position {i}",) | ||
ex.args = ( | ||
f"{str(ex)}, at position {i}. You might want to try:\n" | ||
" - passing ``format='ISO8601'`` if your strings are " | ||
"all ISO8601 but not necessarily in exactly the same format;\n" | ||
" - passing ``format='mixed'``, and the format will be " | ||
"inferred for each element individually. " | ||
"You might want to use ``dayfirst`` alongside this.", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might still want to mention the suggestion to pass a specific `format`. For example, if an error occurs because the format is ambiguous based on the first value (dayfirst/daylast), and was inferred incorrectly:
Here, I think the best is still to provide a manual `format`. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also, a formatting nit: since this is plain text (and not rst that will be rendered), no need for double backticks, I would use single backticks which is a bit less visually distracting) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good point, thanks! |
||
) | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -442,10 +442,11 @@ def _convert_listlike_datetimes( | |
|
||
arg = ensure_object(arg) | ||
|
||
if format is None: | ||
if format is None and format != "mixed": | ||
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) | ||
|
||
if format is not None: | ||
# `format` could not be inferred, or user asked for mixed-format parsing. | ||
if format is not None and format != "mixed": | ||
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) | ||
|
||
result, tz_parsed = objects_to_datetime64ns( | ||
|
@@ -687,7 +688,7 @@ def to_datetime( | |
yearfirst: bool = False, | ||
utc: bool = False, | ||
format: str | None = None, | ||
mroeschke marked this conversation as resolved.
Show resolved
Hide resolved
|
||
exact: bool = True, | ||
exact: bool | lib.NoDefault = lib.no_default, | ||
unit: str | None = None, | ||
infer_datetime_format: lib.NoDefault | bool = lib.no_default, | ||
origin: str = "unix", | ||
|
@@ -759,13 +760,20 @@ def to_datetime( | |
<https://docs.python.org/3/library/datetime.html | ||
#strftime-and-strptime-behavior>`_ for more information on choices, though | ||
note that :const:`"%f"` will parse all the way up to nanoseconds. | ||
You can also pass: | ||
|
||
- "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ | ||
time string (not necessarily in exactly the same format); | ||
- "mixed", to infer the format for each element individually. This is risky, | ||
and you should probably use it along with `dayfirst`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's true that in that case that is best, but the typical caveat of that this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. true, but it would solve the typical example at least In [2]: pd.to_datetime(['12-01-2000 00:00:00', '13-01-2000 00:00:00'], format='mixed', dayfirst=True)
Out[2]: DatetimeIndex(['2000-01-12', '2000-01-13'], dtype='datetime64[ns]', freq=None) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I was just wondering if it would be useful to still call it out here explicitly (but it's quite explicit in the |
||
exact : bool, default True | ||
Control how `format` is used: | ||
|
||
- If :const:`True`, require an exact `format` match. | ||
- If :const:`False`, allow the `format` to match anywhere in the target | ||
string. | ||
|
||
Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. | ||
unit : str, default 'ns' | ||
The unit of the arg (D,s,ms,us,ns) denote the unit, which is an | ||
integer or float number. This will be based off the origin. | ||
|
@@ -997,6 +1005,8 @@ def to_datetime( | |
DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], | ||
dtype='datetime64[ns, UTC]', freq=None) | ||
""" | ||
if exact is not lib.no_default and format in {"mixed", "ISO8601"}: | ||
raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") | ||
if infer_datetime_format is not lib.no_default: | ||
warnings.warn( | ||
"The argument 'infer_datetime_format' is deprecated and will " | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this "preferably", it depends on your data?