Decoding year 1 time (#4506)
* Add test of time passing even if '1-1-1' in units

* Pass test

* Format

* Whatsnew

* Shorten name

* Update whatsnew with pull info

* Clarify comment

* Update doc/whats-new.rst

Co-authored-by: Spencer Clark <[email protected]>

* Add extra note to whatsnew thanks to suggestion from @spencerkclark

* Add failing test of warning

* Pass test of warning when padding

* Cleanup after rebase

* Format

* Cleanup whatsnew

* Apply suggestions from code review by @mathause

Co-authored-by: Mathias Hauser <[email protected]>

Co-authored-by: Spencer Clark <[email protected]>
Co-authored-by: Mathias Hauser <[email protected]>
Co-authored-by: Mathias Hauser <[email protected]>
4 people authored Oct 26, 2020
1 parent 79df665 commit adc55ac
Showing 3 changed files with 91 additions and 18 deletions.
12 changes: 10 additions & 2 deletions doc/whats-new.rst
@@ -36,6 +36,14 @@ New Features
Bug fixes
~~~~~~~~~

- Fix bug where reference times without padded years (e.g. "since 1-1-1") would lose their units when
being passed through :py:func:`encode_cf_datetime` (:issue:`4422`, :pull:`4506`). Such units are ambiguous
about which digit represents the year (is it YMD or DMY?). Now, if such formatting is encountered,
the first digits are assumed to be the year, they are padded appropriately (to e.g. "since 0001-1-1"),
and a warning is issued that this assumption is being made. Previously, without ``cftime``, such times
were silently parsed incorrectly (at least according to the CF conventions): e.g. "since 1-1-1" would
be parsed (via ``pandas`` and ``dateutil``) as "since 2001-1-1".
By `Zeb Nicholls <https://github.com/znicholls>`_.
- Fix :py:meth:`DataArray.plot.step`. By `Deepak Cherian <https://github.com/dcherian>`_.
- Fix bug where reading a scalar value from a NetCDF file opened with the ``h5netcdf`` backend would raise a ``ValueError`` when ``decode_cf=True`` (:issue:`4471`, :pull:`4485`).
By `Gerrit Holl <https://github.com/gerritholl>`_.
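
The first entry above changes user-visible decoding behaviour. As a rough illustration (not part of this commit), the sketch below decodes values against un-padded "days since 1-1-1" units; it assumes ``cftime`` is installed (year 1 is outside the nanosecond ``datetime64`` range) and calls xarray's internal ``decode_cf_datetime``, the same function the tests in this commit use.

import warnings

from xarray.coding.times import decode_cf_datetime

# Sketch only: with this fix, the leading "1" is assumed to be the year, it is
# padded to "0001", and a SerializationWarning is emitted, instead of pandas/
# dateutil silently reading "1-1-1" as 2001-01-01.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    times = decode_cf_datetime([0, 1, 2], "days since 1-1-1", calendar="standard")

print(times[0])  # a cftime date in year 1, i.e. 0001-01-01 00:00:00
print(any("Ambiguous reference date string" in str(w.message) for w in caught))  # True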
@@ -81,7 +89,7 @@ v0.16.1 (2020-09-20)
This patch release fixes an incompatibility with a recent pandas change, which
was causing an issue indexing with a ``datetime64``. It also includes
improvements to ``rolling``, ``to_dataframe``, ``cov`` & ``corr`` methods and
bug fixes. Our documentation has a number of improvements, including fixing all
doctests and confirming their accuracy on every commit.

Many thanks to the 36 contributors who contributed to this release:
@@ -161,7 +169,7 @@ Bug fixes
By `Jens Svensmark <https://github.com/jenssss>`_
- Fix incorrect legend labels for :py:meth:`Dataset.plot.scatter` (:issue:`4126`).
By `Peter Hausamann <https://github.com/phausamann>`_.
- Fix ``dask.optimize`` on ``DataArray`` producing an invalid Dask task graph (:issue:`3698`)
By `Tom Augspurger <https://github.com/TomAugspurger>`_
- Fix ``pip install .`` when no ``.git`` directory exists; namely when the xarray source
directory has been rsync'ed by PyCharm Professional for a remote deployment over SSH.
42 changes: 39 additions & 3 deletions xarray/coding/times.py
@@ -53,14 +53,50 @@ def _netcdf_to_numpy_timeunit(units):
    }[units]


def _ensure_padded_year(ref_date):
    # Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
    # are ambiguous (is it YMD or DMY?). This can lead to some very odd
    # behaviour e.g. pandas (via dateutil) passes '1-1-1 00:00:0.0' as
    # '2001-01-01 00:00:00' (because it assumes a) DMY and b) that year 1 is
    # shorthand for 2001 (like 02 would be shorthand for year 2002)).

    # Here we ensure that there is always a four-digit year, with the
    # assumption being that year comes first if we get something ambiguous.
    matches_year = re.match(r".*\d{4}.*", ref_date)
    if matches_year:
        # all good, return
        return ref_date

    # No four-digit strings, assume the first digits are the year and pad
    # appropriately
    matches_start_digits = re.match(r"(\d+)(.*)", ref_date)
    ref_year, everything_else = [s for s in matches_start_digits.groups()]
    ref_date_padded = "{:04d}{}".format(int(ref_year), everything_else)

    warning_msg = (
        f"Ambiguous reference date string: {ref_date}. The first value is "
        "assumed to be the year hence will be padded with zeros to remove "
        f"the ambiguity (the padded reference date string is: {ref_date_padded}). "
        "To remove this message, remove the ambiguity by padding your reference "
        "date strings with zeros."
    )
    warnings.warn(warning_msg, SerializationWarning)

    return ref_date_padded


def _unpack_netcdf_time_units(units):
    # CF datetime units follow the format: "UNIT since DATE"
    # this parses out the unit and date allowing for extraneous
    # whitespace. It also ensures that the year is padded with zeros
    # so it will be correctly understood by pandas (via dateutil).
    matches = re.match(r"(.+) since (.+)", units)
    if not matches:
        raise ValueError(f"invalid time units: {units}")

    delta_units, ref_date = [s.strip() for s in matches.groups()]
    ref_date = _ensure_padded_year(ref_date)

    return delta_units, ref_date
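
To make the padding rule above concrete, here is a standalone sketch (not xarray API; ``pad_reference_year`` is a made-up name for illustration) of the same logic, plus the pandas/dateutil behaviour that the comment in ``_ensure_padded_year`` warns about.

import re

import pandas as pd

def pad_reference_year(ref_date):
    # mirror of the rule above: if no four-digit year is present, assume the
    # leading digits are the year and zero-pad them to four digits
    if re.match(r".*\d{4}.*", ref_date):
        return ref_date
    year, rest = re.match(r"(\d+)(.*)", ref_date).groups()
    return f"{int(year):04d}{rest}"

assert pad_reference_year("1-1-1 00:00:0.0") == "0001-1-1 00:00:0.0"
assert pad_reference_year("2000-01-01") == "2000-01-01"

# the ambiguity being guarded against (pandas behaviour at the time of this commit):
print(pd.to_datetime("1-1-1 00:00:0.0"))  # Timestamp('2001-01-01 00:00:00')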


55 changes: 42 additions & 13 deletions xarray/tests/test_coding_times.py
@@ -54,6 +54,7 @@
    ([[0]], "days since 1000-01-01"),
    (np.arange(2), "days since 1000-01-01"),
    (np.arange(0, 100000, 20000), "days since 1900-01-01"),
    (np.arange(0, 100000, 20000), "days since 1-01-01"),
    (17093352.0, "hours since 1-1-1 00:00:0.0"),
    ([0.5, 1.5], "hours since 1900-01-01T00:00:00"),
    (0, "milliseconds since 2000-01-01T00:00:00"),
@@ -109,20 +110,16 @@ def test_cf_datetime(num_dates, units, calendar):
    # https://github.com/Unidata/netcdf4-python/issues/355
    assert (abs_diff <= np.timedelta64(1, "s")).all()
    encoded, _, _ = coding.times.encode_cf_datetime(actual, units, calendar)

    assert_array_equal(num_dates, np.around(encoded, 1))
    if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units:
        # verify that wrapping with a pandas.Index works
        # note that it *does not* currently work to even put
        # non-datetime64 compatible dates into a pandas.Index
        encoded, _, _ = coding.times.encode_cf_datetime(
            pd.Index(actual), units, calendar
        )
        assert_array_equal(num_dates, np.around(encoded, 1))


@requires_cftime
@@ -928,3 +925,35 @@ def test_use_cftime_false_non_standard_calendar(calendar, units_year):
    units = f"days since {units_year}-01-01"
    with pytest.raises(OutOfBoundsDatetime):
        decode_cf_datetime(numerical_dates, units, calendar, use_cftime=False)


@requires_cftime
@pytest.mark.parametrize("calendar", _ALL_CALENDARS)
def test_decode_ambiguous_time_warns(calendar):
    # GH 4422, 4506
    from cftime import num2date

    # we don't decode non-standard calendars with
    # pandas, so we expect no warning to be emitted
    is_standard_calendar = calendar in coding.times._STANDARD_CALENDARS

    dates = [1, 2, 3]
    units = "days since 1-1-1"
    expected = num2date(dates, units, calendar=calendar, only_use_cftime_datetimes=True)

    exp_warn_type = SerializationWarning if is_standard_calendar else None

    with pytest.warns(exp_warn_type) as record:
        result = decode_cf_datetime(dates, units, calendar=calendar)

    if is_standard_calendar:
        relevant_warnings = [
            r
            for r in record.list
            if str(r.message).startswith("Ambiguous reference date string: 1-1-1")
        ]
        assert len(relevant_warnings) == 1
    else:
        assert not record

    np.testing.assert_array_equal(result, expected)
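
For users who hit the new ``SerializationWarning``, the message itself points at the preferred fix: pad the reference date. A hypothetical user-side sketch (not part of this commit):

import warnings

# preferred: write an unambiguous, zero-padded reference date
units = "days since 0001-01-01"

# alternatively, if the file's units cannot be changed, silence just this
# message with the standard warnings machinery
warnings.filterwarnings("ignore", message="Ambiguous reference date string")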
