pandas-dev · MarcoGorelli · Feb 14, 2023 · Jan 23, 2023 · Jan 23, 2023 · Jan 23, 2023
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1011,6 +1011,15 @@ first read it in as an object dtype and then apply :func:`to_datetime` to each e
    df['date'] = df['date'].apply(pd.to_datetime)
    df
 
+or, if your datetime formats are all ISO8601 (possibly not identically-formatted):
+
+.. ipython:: python
+
+   data = io.StringIO("date\n2020-01-01\n2020-01-01 03:00\n")
+   df = pd.read_csv(data)
+   df['date'] = pd.to_datetime(df['date'], format='ISO8601')
+   df
+
 .. ipython:: python
    :suppress:
 

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -168,6 +168,7 @@ Other enhancements
 - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`)
 - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
+- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (but possibly not identically-formatted) (:issue:`50411`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -559,6 +560,11 @@ to each element individually, e.g. ::
      ser = pd.Series(['13-01-2000', '12 January 2000'])
      ser.apply(pd.to_datetime)
 
+or, if your formats are all ISO8601 (but possibly not identically-formatted), ::
+
+     ser = pd.Series(['2020-01-01', '2020-01-01 03:00'])
+     pd.to_datetime(ser, format='ISO8601')
+
 .. _whatsnew_200.api_breaking.other:
 
 Other API changes

diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi
@@ -5,6 +5,7 @@ from pandas._typing import npt
 def array_strptime(
     values: npt.NDArray[np.object_],
     fmt: str | None,
+    fmt_inferred: bool = ...,
     exact: bool = ...,
     errors: str = ...,
     utc: bool = ...,

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -152,6 +152,7 @@ cdef dict _parse_code_table = {"y": 0,
 def array_strptime(
     ndarray[object] values,
     str fmt,
+    bint fmt_inferred=False,
     bint exact=True,
     errors="raise",
     bint utc=False,
@@ -186,6 +187,7 @@ def array_strptime(
         bint iso_format = format_is_iso(fmt)
         NPY_DATETIMEUNIT out_bestunit
         int out_local = 0, out_tzoffset = 0
+        bint string_to_dts_succeeded = 0
 
     assert is_raise or is_ignore or is_coerce
 
@@ -306,43 +308,55 @@ def array_strptime(
             else:
                 val = str(val)
 
-            if iso_format:
-                string_to_dts_failed = string_to_dts(
+            if fmt == "ISO8601":
+                string_to_dts_succeeded = not string_to_dts(
+                    val, &dts, &out_bestunit, &out_local,
+                    &out_tzoffset, False, None, False
+                )
+            elif iso_format:
+                string_to_dts_succeeded = not string_to_dts(
                     val, &dts, &out_bestunit, &out_local,
                     &out_tzoffset, False, fmt, exact
                 )
-                if not string_to_dts_failed:
-                    # No error reported by string_to_dts, pick back up
-                    # where we left off
-                    value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
-                    if out_local == 1:
-                        # Store the out_tzoffset in seconds
-                        # since we store the total_seconds of
-                        # dateutil.tz.tzoffset objects
-                        tz = timezone(timedelta(minutes=out_tzoffset))
-                        result_timezone[i] = tz
-                        out_local = 0
-                        out_tzoffset = 0
-                    iresult[i] = value
-                    check_dts_bounds(&dts)
-                    continue
+            if string_to_dts_succeeded:
+                # No error reported by string_to_dts, pick back up
+                # where we left off
+                value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
+                if out_local == 1:
+                    # Store the out_tzoffset in seconds
+                    # since we store the total_seconds of
+                    # dateutil.tz.tzoffset objects
+                    tz = timezone(timedelta(minutes=out_tzoffset))
+                    result_timezone[i] = tz
+                    out_local = 0
+                    out_tzoffset = 0
+                iresult[i] = value
+                check_dts_bounds(&dts)
+                continue
 
             if parse_today_now(val, &iresult[i], utc):
                 continue
 
             # Some ISO formats can't be parsed by string_to_dts
-            # For example, 6-digit YYYYMD. So, if there's an error,
-            # try the string-matching code below.
+            # For example, 6-digit YYYYMD. So, if there's an error, and a format
+            # was specified, then try the string-matching code below. If the format
+            # specified was 'ISO8601', then we need to error, because
+            # only string_to_dts handles mixed ISO8601 formats.
+            if not string_to_dts_succeeded and fmt == "ISO8601":
+                raise ValueError(f"Time data {val} is not ISO8601 format")
 
             # exact matching
             if exact:
                 found = format_regex.match(val)
                 if not found:
-                    raise ValueError(f"time data \"{val}\" doesn't "
-                                     f"match format \"{fmt}\"")
+                    raise ValueError(
+                        f"time data \"{val}\" doesn't "
+                        f"match {'(inferred) '*fmt_inferred}format \"{fmt}\""
+                    )
                 if len(val) != found.end():
                     raise ValueError(
-                        f"unconverted data remains: "
+                        "unconverted data remains when parsing with "
+                        f"{'(inferred) '*fmt_inferred}format \"{fmt}\": "
                         f'"{val[found.end():]}"'
                     )
 
@@ -352,7 +366,7 @@ def array_strptime(
                 if not found:
                     raise ValueError(
                         f"time data \"{val}\" doesn't match "
-                        f"format \"{fmt}\""
+                        f"{'(inferred) '*fmt_inferred}format \"{fmt}\""
                     )
 
             iso_year = -1

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -442,11 +442,15 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
+    format_inferred = False
     if format is None:
         format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
+        format_inferred = True
 
     if format is not None:
-        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
+        return _array_strptime_with_fallback(
+            arg, name, utc, format, format_inferred, exact, errors
+        )
 
     result, tz_parsed = objects_to_datetime64ns(
         arg,
@@ -471,13 +475,16 @@ def _array_strptime_with_fallback(
     name,
     utc: bool,
     fmt: str,
+    fmt_inferred: bool,
     exact: bool,
     errors: str,
 ) -> Index:
     """
     Call array_strptime, with fallback behavior depending on 'errors'.
     """
-    result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
+    result, timezones = array_strptime(
+        arg, fmt, fmt_inferred=fmt_inferred, exact=exact, errors=errors, utc=utc
+    )
     if any(tz is not None for tz in timezones):
         return _return_parsed_timezone_results(result, timezones, utc, name)
 
@@ -759,6 +766,7 @@ def to_datetime(
         <https://docs.python.org/3/library/datetime.html
         #strftime-and-strptime-behavior>`_ for more information on choices, though
         note that :const:`"%f"` will parse all the way up to nanoseconds.
+        You can also pass "ISO8601" to parse ISO8601 strings of possibly mixed formats.
     exact : bool, default True
         Control how `format` is used:
 

diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1719,7 +1719,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
     with pytest.raises(
         ValueError,
         match=(
-            r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", at position 1$'
+            r'^time data "31/05/2000" doesn\'t match \(inferred\) format "%m/%d/%Y", '
+            r"at position 1$"
         ),
     ):
         pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -133,7 +133,11 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
         ser2 = ser.apply(str)
         ser2[2] = "nat"
         with pytest.raises(
-            ValueError, match='unconverted data remains: ".0", at position 0'
+            ValueError,
+            match=(
+                'unconverted data remains when parsing with format "%Y%m%d": ".0", '
+                "at position 0"
+            ),
         ):
             # https://github.com/pandas-dev/pandas/issues/50051
             to_datetime(ser2, format="%Y%m%d", cache=cache)
@@ -528,7 +532,8 @@ def test_to_datetime_parse_timezone_malformed(self, offset):
         msg = "|".join(
             [
                 r'^time data ".*" doesn\'t match format ".*", at position 0$',
-                r'^unconverted data remains: ".*", at position 0$',
+                r'^unconverted data remains when parsing with format ".*": ".*", '
+                "at position 0$",
             ]
         )
         with pytest.raises(ValueError, match=msg):
@@ -1288,7 +1293,10 @@ def test_datetime_bool_arrays_mixed(self, cache):
             to_datetime([False, datetime.today()], cache=cache)
         with pytest.raises(
             ValueError,
-            match=r'^time data "True" doesn\'t match format "%Y%m%d", at position 1$',
+            match=(
+                r'^time data "True" doesn\'t match \(inferred\) format "%Y%m%d", '
+                "at position 1$"
+            ),
         ):
             to_datetime(["20130101", True], cache=cache)
         tm.assert_index_equal(
@@ -1331,7 +1339,8 @@ def test_datetime_invalid_scalar(self, value, format, warning):
             [
                 r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$',
                 r'^Given date string "a" not likely a datetime, at position 0$',
-                r'^unconverted data remains: "9", at position 0$',
+                r'^unconverted data remains when parsing with format "%H:%M:%S": "9", '
+                "at position 0$",
                 r"^second must be in 0..59: 00:01:99, at position 0$",
             ]
         )
@@ -1382,7 +1391,8 @@ def test_datetime_invalid_index(self, values, format, warning):
             [
                 r'^Given date string "a" not likely a datetime, at position 0$',
                 r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$',
-                r'^unconverted data remains: "9", at position 0$',
+                r'^unconverted data remains when parsing with format "%H:%M:%S": "9", '
+                "at position 0$",
                 r"^second must be in 0..59: 00:01:99, at position 0$",
             ]
         )
@@ -2143,8 +2153,8 @@ def test_dataframe_float(self, cache):
         # float
         df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
         msg = (
-            r"^cannot assemble the datetimes: unconverted data remains: "
-            r'"1", at position 0$'
+            r"^cannot assemble the datetimes: unconverted data remains when parsing "
+            r'with format ".*": "1", at position 0$'
         )
         with pytest.raises(ValueError, match=msg):
             to_datetime(df, cache=cache)
@@ -2226,7 +2236,8 @@ def test_to_datetime_iso8601_exact_fails(self, input, format):
         # `format` is shorter than the date string, so only fails with `exact=True`
         msg = "|".join(
             [
-                '^unconverted data remains: ".*", at position 0$',
+                '^unconverted data remains when parsing with format ".*": ".*"'
+                ", at position 0$",
                 'time data ".*" doesn\'t match format ".*", at position 0',
             ]
         )
@@ -2360,7 +2371,10 @@ def test_to_datetime_on_datetime64_series(self, cache):
     def test_to_datetime_with_space_in_series(self, cache):
         # GH 6428
         ser = Series(["10/18/2006", "10/18/2008", " "])
-        msg = r'^time data " " doesn\'t match format "%m/%d/%Y", at position 2$'
+        msg = (
+            r'^time data " " doesn\'t match \(inferred\) format "%m/%d/%Y", '
+            "at position 2$"
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, errors="raise", cache=cache)
         result_coerce = to_datetime(ser, errors="coerce", cache=cache)
@@ -2624,7 +2638,7 @@ def test_dayfirst_warnings_invalid_input(self):
         with pytest.raises(
             ValueError,
             match=(
-                r'^time data "03/30/2011" doesn\'t match format '
+                r'^time data "03/30/2011" doesn\'t match \(inferred\) format '
                 r'"%d/%m/%Y", at position 1$'
             ),
         ):
@@ -2695,7 +2709,7 @@ def test_to_datetime_inconsistent_format(self, cache):
         data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
         ser = Series(np.array(data))
         msg = (
-            r'^time data "01-02-2011 00:00:00" doesn\'t match format '
+            r'^time data "01-02-2011 00:00:00" doesn\'t match \(inferred\) format '
             r'"%m/%d/%Y %H:%M:%S", at position 1$'
         )
         with pytest.raises(ValueError, match=msg):
@@ -2835,7 +2849,8 @@ def test_day_not_in_month_raise(self, cache):
             (
                 "2015-02-32",
                 "%Y-%m-%d",
-                '^unconverted data remains: "2", at position 0$',
+                '^unconverted data remains when parsing with format "%Y-%m-%d": "2", '
+                "at position 0$",
             ),
             (
                 "2015-32-02",
@@ -3490,3 +3505,31 @@ def test_to_datetime_format_f_parse_nanos():
         nanosecond=789,
     )
     assert result == expected
+
+
+def test_to_datetime_mixed_iso8601():
+    # https://github.com/pandas-dev/pandas/issues/50411
+    result = to_datetime(["2020-01-01", "2020-01-01 05:00:00"], format="ISO8601")
+    expected = DatetimeIndex(["2020-01-01 00:00:00", "2020-01-01 05:00:00"])
+    tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_mixed_not_necessarily_iso8601_raise():
+    # https://github.com/pandas-dev/pandas/issues/50411
+    with pytest.raises(
+        ValueError, match="Time data 01-01-2000 is not ISO8601 format, at position 1"
+    ):
+        to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601")
+
+
+@pytest.mark.parametrize(
+    ("errors", "expected"),
+    [
+        ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])),
+        ("ignore", Index(["2020-01-01", "01-01-2000"])),
+    ],
+)
+def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected):
+    # https://github.com/pandas-dev/pandas/issues/50411
+    result = to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601", errors=errors)
+    tm.assert_index_equal(result, expected)
diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md
@@ -4,7 +4,7 @@
 - Status: Accepted
 - Discussion: [#48621](https://github.com/pandas-dev/pandas/pull/48621)
 - Author: [Marco Gorelli](https://github.com/MarcoGorelli)
-- Revision: 1
+- Revision: 2
 
 ## Abstract
 
@@ -64,6 +64,11 @@ Out[3]:
 1   2000-01-13
 dtype: datetime64[ns]
 ```
+or, if their dates are all ISO8601 (but possibly not identically formatted),
+```ipython
+In [4]: pd.to_datetime(['2020-01-01', '2020-01-01 03:00'], format='ISO8601')
+Out[4]: DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 03:00:00'], dtype='datetime64[ns]', freq=None)
+```
 
 ## Usage and Impact
 
@@ -99,3 +104,4 @@ We could make ``guess_datetime_format`` smarter by using a random sample of elem
 ### PDEP History
 
 - 18 September 2022: Initial draft
+- 23 January 2023: Amended to mention ``format='ISO8601'`` option