Datetime parsing (PDEP-4): allow mixture of ISO formatted strings (#50939)

* allow format iso8601 * fixup tests * 🏷️ typing * remove duplicate code * improve message, use if-statement * note that exact has no effect if format=iso8601 * point to format=ISO8601 in error message * allow format="mixed" * link to iso wiki page * minor fixups * double backticks -> single, suggest passing format * use format=mixed instead of apply in example; --------- Co-authored-by: MarcoGorelli <> Co-authored-by: Matthew Roeschke <[email protected]>
pandas-dev · Feb 14, 2023 · d1095bc · d1095bc
1 parent f3d4113
commit d1095bc
Show file tree

Hide file tree

Showing 7 changed files with 186 additions and 70 deletions.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1001,14 +1001,23 @@ way to parse dates is to explicitly set ``format=``.
    )
    df
 
-In the case that you have mixed datetime formats within the same column, you'll need to
-first read it in as an object dtype and then apply :func:`to_datetime` to each element.
+In the case that you have mixed datetime formats within the same column, you can
+pass ``format='mixed'``:
 
 .. ipython:: python
 
    data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n")
    df = pd.read_csv(data)
-   df['date'] = df['date'].apply(pd.to_datetime)
+   df['date'] = pd.to_datetime(df['date'], format='mixed')
+   df
+
+or, if your datetime formats are all ISO8601 (possibly not identically-formatted):
+
+.. ipython:: python
+
+   data = io.StringIO("date\n2020-01-01\n2020-01-01 03:00\n")
+   df = pd.read_csv(data)
+   df['date'] = pd.to_datetime(df['date'], format='ISO8601')
    df
 
 .. ipython:: python

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -311,6 +311,8 @@ Other enhancements
 - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
 - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
+- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 strings (even if they are not identically formatted) (:issue:`50411`)
+- :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`)
 - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
 - :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
@@ -738,11 +740,16 @@ In the past, :func:`to_datetime` guessed the format for each element independent
 
 Note that this affects :func:`read_csv` as well.
 
-If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime`
-to each element individually, e.g. ::
+If you still need to parse dates with inconsistent formats, you can use
+``format='mixed'`` (possibly alongside ``dayfirst``) ::
 
      ser = pd.Series(['13-01-2000', '12 January 2000'])
-     ser.apply(pd.to_datetime)
+     pd.to_datetime(ser, format='mixed', dayfirst=True)
+
+or, if your formats are all ISO8601 (but possibly not identically-formatted) ::
+
+     ser = pd.Series(['2020-01-01', '2020-01-01 03:00'])
+     pd.to_datetime(ser, format='ISO8601')
 
 .. _whatsnew_200.api_breaking.other:
 

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -186,6 +186,7 @@ def array_strptime(
         bint iso_format = format_is_iso(fmt)
         NPY_DATETIMEUNIT out_bestunit
         int out_local = 0, out_tzoffset = 0
+        bint string_to_dts_succeeded = 0
 
     assert is_raise or is_ignore or is_coerce
 
@@ -306,53 +307,62 @@ def array_strptime(
             else:
                 val = str(val)
 
-            if iso_format:
-                string_to_dts_failed = string_to_dts(
+            if fmt == "ISO8601":
+                string_to_dts_succeeded = not string_to_dts(
+                    val, &dts, &out_bestunit, &out_local,
+                    &out_tzoffset, False, None, False
+                )
+            elif iso_format:
+                string_to_dts_succeeded = not string_to_dts(
                     val, &dts, &out_bestunit, &out_local,
                     &out_tzoffset, False, fmt, exact
                 )
-                if not string_to_dts_failed:
-                    # No error reported by string_to_dts, pick back up
-                    # where we left off
-                    value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
-                    if out_local == 1:
-                        # Store the out_tzoffset in seconds
-                        # since we store the total_seconds of
-                        # dateutil.tz.tzoffset objects
-                        tz = timezone(timedelta(minutes=out_tzoffset))
-                        result_timezone[i] = tz
-                        out_local = 0
-                        out_tzoffset = 0
-                    iresult[i] = value
-                    check_dts_bounds(&dts)
-                    continue
+            if string_to_dts_succeeded:
+                # No error reported by string_to_dts, pick back up
+                # where we left off
+                value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
+                if out_local == 1:
+                    # Store the out_tzoffset in seconds
+                    # since we store the total_seconds of
+                    # dateutil.tz.tzoffset objects
+                    tz = timezone(timedelta(minutes=out_tzoffset))
+                    result_timezone[i] = tz
+                    out_local = 0
+                    out_tzoffset = 0
+                iresult[i] = value
+                check_dts_bounds(&dts)
+                continue
 
             if parse_today_now(val, &iresult[i], utc):
                 continue
 
             # Some ISO formats can't be parsed by string_to_dts
-            # For example, 6-digit YYYYMD. So, if there's an error,
-            # try the string-matching code below.
+            # For example, 6-digit YYYYMD. So, if there's an error, and a format
+            # was specified, then try the string-matching code below. If the format
+            # specified was 'ISO8601', then we need to error, because
+            # only string_to_dts handles mixed ISO8601 formats.
+            if not string_to_dts_succeeded and fmt == "ISO8601":
+                raise ValueError(f"Time data {val} is not ISO8601 format")
 
             # exact matching
             if exact:
                 found = format_regex.match(val)
                 if not found:
-                    raise ValueError(f"time data \"{val}\" doesn't "
-                                     f"match format \"{fmt}\"")
+                    raise ValueError(
+                        f"time data \"{val}\" doesn't match format \"{fmt}\""
+                    )
                 if len(val) != found.end():
                     raise ValueError(
-                        f"unconverted data remains: "
-                        f'"{val[found.end():]}"'
+                        "unconverted data remains when parsing with "
+                        f"format \"{fmt}\": \"{val[found.end():]}\""
                     )
 
             # search
             else:
                 found = format_regex.search(val)
                 if not found:
                     raise ValueError(
-                        f"time data \"{val}\" doesn't match "
-                        f"format \"{fmt}\""
+                        f"time data \"{val}\" doesn't match format \"{fmt}\""
                     )
 
             iso_year = -1
@@ -504,7 +514,15 @@ def array_strptime(
             result_timezone[i] = tz
 
         except (ValueError, OutOfBoundsDatetime) as ex:
-            ex.args = (f"{str(ex)}, at position {i}",)
+            ex.args = (
+                f"{str(ex)}, at position {i}. You might want to try:\n"
+                "    - passing `format` if your strings have a consistent format;\n"
+                "    - passing `format='ISO8601'` if your strings are "
+                "all ISO8601 but not necessarily in exactly the same format;\n"
+                "    - passing `format='mixed'`, and the format will be "
+                "inferred for each element individually. "
+                "You might want to use `dayfirst` alongside this.",
+            )
             if is_coerce:
                 iresult[i] = NPY_NAT
                 continue

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -445,7 +445,8 @@ def _convert_listlike_datetimes(
     if format is None:
         format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
 
-    if format is not None:
+    # `format` could be inferred, or user didn't ask for mixed-format parsing.
+    if format is not None and format != "mixed":
         return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
 
     result, tz_parsed = objects_to_datetime64ns(
@@ -687,7 +688,7 @@ def to_datetime(
     yearfirst: bool = False,
     utc: bool = False,
     format: str | None = None,
-    exact: bool = True,
+    exact: bool | lib.NoDefault = lib.no_default,
     unit: str | None = None,
     infer_datetime_format: lib.NoDefault | bool = lib.no_default,
     origin: str = "unix",
@@ -717,9 +718,7 @@ def to_datetime(
         .. warning::
 
             ``dayfirst=True`` is not strict, but will prefer to parse
-            with day first. If a delimited date string cannot be parsed in
-            accordance with the given `dayfirst` option, e.g.
-            ``to_datetime(['31-12-2021'])``, then a warning will be shown.
+            with day first.
 
     yearfirst : bool, default False
         Specify a date parse order if `arg` is str or is list-like.
@@ -759,13 +758,20 @@ def to_datetime(
         <https://docs.python.org/3/library/datetime.html
         #strftime-and-strptime-behavior>`_ for more information on choices, though
         note that :const:`"%f"` will parse all the way up to nanoseconds.
+        You can also pass:
+
+        - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
+          time string (not necessarily in exactly the same format);
+        - "mixed", to infer the format for each element individually. This is risky,
+          and you should probably use it along with `dayfirst`.
     exact : bool, default True
         Control how `format` is used:
 
         - If :const:`True`, require an exact `format` match.
         - If :const:`False`, allow the `format` to match anywhere in the target
           string.
 
+        Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
     unit : str, default 'ns'
         The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
         integer or float number. This will be based off the origin.
@@ -997,6 +1003,8 @@ def to_datetime(
     DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
                   dtype='datetime64[ns, UTC]', freq=None)
     """
+    if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
+        raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
     if infer_datetime_format is not lib.no_default:
         warnings.warn(
             "The argument 'infer_datetime_format' is deprecated and will "

diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1721,7 +1721,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
     with pytest.raises(
         ValueError,
         match=(
-            r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", at position 1$'
+            r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
+            r"at position 1. You might want to try:"
         ),
     ):
         pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])