diff --git a/doc/internals/index.rst b/doc/internals/index.rst
index b2a37900338..4c00376a7b4 100644
--- a/doc/internals/index.rst
+++ b/doc/internals/index.rst
@@ -26,3 +26,4 @@ The pages in this section are intended for:
    how-to-add-new-backend
    how-to-create-custom-index
    zarr-encoding-spec
+   time-coding
diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst
new file mode 100644
index 00000000000..a7e0d5de23d
--- /dev/null
+++ b/doc/internals/time-coding.rst
@@ -0,0 +1,475 @@
.. ipython:: python
    :suppress:

    import numpy as np
    import pandas as pd
    import xarray as xr

    np.random.seed(123456)
    np.set_printoptions(threshold=20)
    int64_max = np.iinfo("int64").max
    int64_min = np.iinfo("int64").min + 1
    uint64_max = np.iinfo("uint64").max

.. _internals.timecoding:

Time Coding
===========

This page gives an overview of how xarray encodes and decodes times and of the conventions and functions that are used.

Pandas functionality
--------------------

to_datetime
~~~~~~~~~~~

The function :py:func:`pandas.to_datetime` is used within xarray for inferring units and for testing purposes.

In normal operation :py:func:`pandas.to_datetime` returns a :py:class:`pandas.Timestamp` (for scalar input) or :py:class:`pandas.DatetimeIndex` (for array-like input), which are based on ``np.datetime64`` values with a resolution inherited from the input (one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``). If no resolution can be inherited, ``'ns'`` is assumed. This implies that the maximum usable time range for those cases is approximately +/- 292 years centered around the Unix epoch (1970-01-01). To accommodate that, we carefully check the units/resolution in the encoding and decoding steps.

When the arguments are numeric (not strings or ``np.datetime64`` values) ``"unit"`` can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``.

.. ipython:: python

    f"Minimum datetime: {pd.to_datetime(int64_min, unit='ns')}"
    f"Maximum datetime: {pd.to_datetime(int64_max, unit='ns')}"

For input values which can't be represented in nanosecond resolution an :py:class:`pandas.OutOfBoundsDatetime` exception is raised:

.. ipython:: python

    try:
        dtime = pd.to_datetime(int64_max, unit="us")
    except Exception as err:
        print(err)
    try:
        dtime = pd.to_datetime(uint64_max, unit="ns")
        print("Wrong:", dtime)
        dtime = pd.to_datetime([uint64_max], unit="ns")
    except Exception as err:
        print(err)

``np.datetime64`` values can be extracted with :py:meth:`pandas.Timestamp.to_numpy` and :py:meth:`pandas.DatetimeIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timestamp.as_unit`
and :py:meth:`pandas.DatetimeIndex.as_unit` respectively.

``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. That means we are able to represent datetimes with second, millisecond, microsecond or nanosecond resolution.
.. ipython:: python

    time = pd.to_datetime(np.datetime64(0, "D"))
    print("Datetime:", time, np.asarray([time.to_numpy()]).dtype)
    print("Datetime as_unit('ms'):", time.as_unit("ms"))
    print("Datetime to_numpy():", time.as_unit("ms").to_numpy())
    time = pd.to_datetime(np.array([-1000, 1, 2], dtype="datetime64[Y]"))
    print("DatetimeIndex:", time)
    print("DatetimeIndex as_unit('us'):", time.as_unit("us"))
    print("DatetimeIndex to_numpy():", time.as_unit("us").to_numpy())

.. warning::
    Input data with resolution higher than ``'ns'`` (e.g. ``'ps'``, ``'fs'``, ``'as'``) is truncated (not rounded) at the ``'ns'`` level. This is `currently broken `_ for the ``'ps'`` input, where it is interpreted as ``'ns'``.

    .. ipython:: python

        print("Good:", pd.to_datetime([np.datetime64(1901901901901, "as")]))
        print("Good:", pd.to_datetime([np.datetime64(1901901901901, "fs")]))
        print(" Bad:", pd.to_datetime([np.datetime64(1901901901901, "ps")]))
        print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ns")]))
        print("Good:", pd.to_datetime([np.datetime64(1901901901901, "us")]))
        print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ms")]))

.. warning::
    Care has to be taken, as some configurations of input data will raise. The following shows that it is safe to use :py:func:`pandas.to_datetime` when providing :py:class:`numpy.datetime64` scalars or numpy arrays as input.

    .. ipython:: python

        print(
            "Works:",
            np.datetime64(1901901901901, "s"),
            pd.to_datetime(np.datetime64(1901901901901, "s")),
        )
        print(
            "Works:",
            np.array([np.datetime64(1901901901901, "s")]),
            pd.to_datetime(np.array([np.datetime64(1901901901901, "s")])),
        )
        try:
            pd.to_datetime([np.datetime64(1901901901901, "s")])
        except Exception as err:
            print("Raises:", err)
        try:
            pd.to_datetime(1901901901901, unit="s")
        except Exception as err:
            print("Raises:", err)
        try:
            pd.to_datetime([1901901901901], unit="s")
        except Exception as err:
            print("Raises:", err)
        try:
            pd.to_datetime(np.array([1901901901901]), unit="s")
        except Exception as err:
            print("Raises:", err)


to_timedelta
~~~~~~~~~~~~

The function :py:func:`pandas.to_timedelta` is used within xarray for inferring units and for testing purposes.

In normal operation :py:func:`pandas.to_timedelta` returns a :py:class:`pandas.Timedelta` (for scalar input) or :py:class:`pandas.TimedeltaIndex` (for array-like input), which hold ``np.timedelta64`` values with ``ns`` resolution internally. This implies that the usable timedelta range covers only roughly 585 years. To accommodate that, we work around this limitation in the encoding and decoding steps.

.. ipython:: python

    f"Maximum timedelta range: ({pd.to_timedelta(int64_min, unit='ns')}, {pd.to_timedelta(int64_max, unit='ns')})"

For input values which can't be represented in nanosecond resolution an :py:class:`pandas.OutOfBoundsTimedelta` exception is raised:

.. ipython:: python

    try:
        delta = pd.to_timedelta(int64_max, unit="us")
    except Exception as err:
        print("First:", err)
    try:
        delta = pd.to_timedelta(uint64_max, unit="ns")
    except Exception as err:
        print("Second:", err)

When arguments are numeric (not strings or ``np.timedelta64`` values) ``"unit"`` can be anything from ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``.
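A quick check of this behavior (nothing here is new API, it just illustrates the paragraph above): the ``unit`` argument only scales the numeric input, while the returned resolution stays ``'ns'``.

.. ipython:: python

    # one day, given in "days"; internally represented at 'ns' resolution
    delta = pd.to_timedelta(1, unit="D")
    print("Timedelta:", delta, delta.unit)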
``np.timedelta64`` values can be extracted with :py:meth:`pandas.Timedelta.to_numpy` and :py:meth:`pandas.TimedeltaIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timedelta.as_unit`
and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively.

``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. That means we are able to represent timedeltas with second, millisecond, microsecond or nanosecond resolution.

.. ipython:: python

    delta = pd.to_timedelta(np.timedelta64(1, "D"))
    print("Timedelta:", delta, np.asarray([delta.to_numpy()]).dtype)
    print("Timedelta as_unit('ms'):", delta.as_unit("ms"))
    print("Timedelta to_numpy():", delta.as_unit("ms").to_numpy())
    delta = pd.to_timedelta([0, 1, 2], unit="D")
    print("TimedeltaIndex:", delta)
    print("TimedeltaIndex as_unit('ms'):", delta.as_unit("ms"))
    print("TimedeltaIndex to_numpy():", delta.as_unit("ms").to_numpy())

.. warning::
    Care has to be taken, as some configurations of input data will raise. The following shows that it is safe to use :py:func:`pandas.to_timedelta` when providing :py:class:`numpy.timedelta64` scalars or numpy arrays as input.

    .. ipython:: python

        print(
            "Works:",
            np.timedelta64(1901901901901, "s"),
            pd.to_timedelta(np.timedelta64(1901901901901, "s")),
        )
        print(
            "Works:",
            np.array([np.timedelta64(1901901901901, "s")]),
            pd.to_timedelta(np.array([np.timedelta64(1901901901901, "s")])),
        )
        try:
            pd.to_timedelta([np.timedelta64(1901901901901, "s")])
        except Exception as err:
            print("Raises:", err)
        try:
            pd.to_timedelta(1901901901901, unit="s")
        except Exception as err:
            print("Raises:", err)
        try:
            pd.to_timedelta([1901901901901], unit="s")
        except Exception as err:
            print("Raises:", err)
        try:
            pd.to_timedelta(np.array([1901901901901]), unit="s")
        except Exception as err:
            print("Raises:", err)

Timestamp
~~~~~~~~~

:py:class:`pandas.Timestamp` is used within xarray to wrap strings of CF reference times and :py:class:`datetime.datetime` objects.

When arguments are numeric (not strings) ``"unit"`` can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``.

In normal operation :py:class:`pandas.Timestamp` holds the timestamp in the provided resolution, but only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is cut to ``'ns'``.

The same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see `to_timedelta`_).
Depending on the internal resolution, Timestamps can be represented in the range:

.. ipython:: python

    for unit in ["s", "ms", "us", "ns"]:
        print(
            f"unit: {unit!r} time range ({pd.Timestamp(int64_min, unit=unit)}, {pd.Timestamp(int64_max, unit=unit)})"
        )

With the relaxed resolution, this extends the representable range to several hundred thousand centuries at microsecond resolution. ``NaT`` will be at ``np.iinfo("int64").min`` for all of the different representations.

.. warning::
    When initialized with a datetime string, parsing is only defined from ``-9999-01-01`` to ``9999-12-31``.

    .. ipython:: python

        try:
            print("Works:", pd.Timestamp("-9999-01-01 00:00:00"))
            print("Works, too:", pd.Timestamp("9999-12-31 23:59:59"))
            print(pd.Timestamp("10000-01-01 00:00:00"))
        except Exception as err:
            print("Errors:", err)
.. note::
    :py:class:`pandas.Timestamp` is currently the only way to correctly parse reference time strings. It handles non-ISO formatted strings, keeps the resolution of the strings (``'s'``, ``'ms'``, etc.) and parses time zones. When initialized with :py:class:`numpy.datetime64` instead of a string, it even overcomes the above limitation of the possible time range.

    .. ipython:: python

        try:
            print("Handles non-ISO:", pd.Timestamp("92-1-8 151542"))
            print(
                "Keeps resolution 1:",
                pd.Timestamp("1992-10-08 15:15:42"),
                pd.Timestamp("1992-10-08 15:15:42").unit,
            )
            print(
                "Keeps resolution 2:",
                pd.Timestamp("1992-10-08 15:15:42.5"),
                pd.Timestamp("1992-10-08 15:15:42.5").unit,
            )
            print(
                "Keeps timezone:",
                pd.Timestamp("1992-10-08 15:15:42.5 -6:00"),
                pd.Timestamp("1992-10-08 15:15:42.5 -6:00").unit,
            )
            print(
                "Extends timerange:",
                pd.Timestamp(np.datetime64("-10000-10-08 15:15:42.5001")),
                pd.Timestamp(np.datetime64("-10000-10-08 15:15:42.5001")).unit,
            )
        except Exception as err:
            print("Errors:", err)

DatetimeIndex
~~~~~~~~~~~~~

:py:class:`pandas.DatetimeIndex` is used to wrap ``np.datetime64`` values or other datetime-likes when encoding. The resolution of the DatetimeIndex depends on the input, but can be only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is cut to ``'ns'``.
:py:class:`pandas.DatetimeIndex` will raise :py:class:`pandas.OutOfBoundsDatetime` if the input can't be represented in the given resolution.

.. ipython:: python

    try:
        print(
            "Works:",
            pd.DatetimeIndex(
                np.array(["1992-01-08", "1992-01-09"], dtype="datetime64[D]")
            ),
        )
        print(
            "Works:",
            pd.DatetimeIndex(
                np.array(
                    ["1992-01-08 15:15:42", "1992-01-09 15:15:42"],
                    dtype="datetime64[s]",
                )
            ),
        )
        print(
            "Works:",
            pd.DatetimeIndex(
                np.array(
                    ["1992-01-08 15:15:42.5", "1992-01-09 15:15:42.0"],
                    dtype="datetime64[ms]",
                )
            ),
        )
        print(
            "Works:",
            pd.DatetimeIndex(
                np.array(
                    ["1970-01-01 00:00:00.401501601701801901", "1970-01-01 00:00:00"],
                    dtype="datetime64[as]",
                )
            ),
        )
        print(
            "Works:",
            pd.DatetimeIndex(
                np.array(
                    ["-10000-01-01 00:00:00.401501", "1970-01-01 00:00:00"],
                    dtype="datetime64[us]",
                )
            ),
        )
    except Exception as err:
        print("Errors:", err)

CF Conventions Time Handling
----------------------------

Xarray tries to adhere to the latest version of the `CF Conventions`_. Relevant is the section on `Time Coordinate`_ and the `Calendar`_ subsection.

.. _CF Conventions: https://cfconventions.org
.. _Time Coordinate: https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#time-coordinate
.. _Calendar: https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#calendar

CF time decoding
~~~~~~~~~~~~~~~~

Decoding of ``values`` with a time unit specification like ``"seconds since 1992-10-8 15:15:42.5 -6:00"`` into datetimes using the CF conventions is a multistage process.

1. If we have a non-standard calendar (e.g. ``"noleap"``), decoding is done with the ``cftime`` package, which is not covered in this section. For the ``"standard"``/``"gregorian"`` calendar as well as the ``"proleptic_gregorian"`` calendar the pandas functionality outlined above is used.

2. The ``"standard"``/``"gregorian"`` calendar and the ``"proleptic_gregorian"`` calendar are equivalent for any dates and reference times >= ``"1582-10-15"``.
   First, the reference time is checked and any timezone information is stripped off. In a second step, the minimum and maximum ``values`` are checked to see whether they can be represented in the current reference time resolution; integer overflow is caught at the same time. For the ``"standard"``/``"gregorian"`` calendar the dates are checked to be >= ``"1582-10-15"``. If anything fails, the decoding is attempted with ``cftime``.

3. As the unit (here ``"seconds"``) and the resolution of the reference time ``"1992-10-8 15:15:42.5 -6:00"`` (here ``"milliseconds"``) might differ, the decoding resolution is aligned to the higher resolution of the two. Users may also specify their desired target resolution by setting the ``time_unit`` keyword argument to one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` (default ``'ns'``), which is included in the alignment process. The alignment is done by multiplying the ``values`` by the ratio of nanoseconds per time unit to nanoseconds per reference time unit. To retain consistency for ``NaT`` values, a mask is kept and re-introduced after the multiplication.

4. Times encoded as floating point values are checked for fractional parts and the resolution is enhanced in an iterative process until a fitting resolution (or ``'ns'``) is found. A ``SerializationWarning`` is issued to make the user aware of the possibly problematic encoding.

5. Finally, the ``values`` (at this point converted to ``int64`` values) are cast to ``datetime64[unit]`` (using the unit retrieved above) and added to the reference time :py:class:`pandas.Timestamp`.

.. ipython:: python

    calendar = "proleptic_gregorian"
    values = np.array([-1000 * 365, 0, 1000 * 365], dtype="int64")
    units = "days since 2000-01-01 00:00:00.000001"
    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
    assert dt.dtype == "datetime64[us]"
    dt

.. ipython:: python

    units = "microseconds since 2000-01-01 00:00:00"
    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
    assert dt.dtype == "datetime64[us]"
    dt

.. ipython:: python

    values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64")
    units = "days since 2000-01-01 00:00:00.001"
    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
    assert dt.dtype == "datetime64[ms]"
    dt

.. ipython:: python

    values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64")
    units = "hours since 2000-01-01"
    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
    assert dt.dtype == "datetime64[s]"
    dt

.. ipython:: python

    values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64")
    units = "hours since 2000-01-01 00:00:00 03:30"
    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
    assert dt.dtype == "datetime64[s]"
    dt

.. ipython:: python

    values = np.array([-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64")
    units = "days since 0001-01-01 00:00:00"
    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
    assert dt.dtype == "datetime64[s]"
    dt

CF time encoding
~~~~~~~~~~~~~~~~

For encoding, the process is more or less a reversal of the above, but we have to make some decisions on default values.

1. Infer ``data_units`` from the given ``dates``.
2. Infer ``units`` (either clean up the given ``units`` or use ``data_units``).
3. Infer the calendar name from the given ``dates``.
4. If dates are :py:class:`cftime.datetime` objects, then encode with ``cftime.date2num``.
5. Retrieve ``time_units`` and ``ref_date`` from ``units``.
6. Check ``ref_date`` >= ``1582-10-15``; otherwise fall back to ``cftime``.
7. Wrap ``dates`` in a :py:class:`pandas.DatetimeIndex`.
8. Subtract ``ref_date`` (:py:class:`pandas.Timestamp`) from the above :py:class:`pandas.DatetimeIndex`, which returns a :py:class:`pandas.TimedeltaIndex`.
9. Align the resolution of the :py:class:`pandas.TimedeltaIndex` with the resolution of ``time_units``.
10. Retrieve the needed ``units`` and ``delta`` to faithfully encode into int64.
11. Divide ``time_deltas`` by ``delta``, using floor division (integer) or normal division (float).
12. Return the result.

.. ipython:: python
    :okwarning:

    calendar = "proleptic_gregorian"
    dates = np.array(
        [
            "-2000-01-01T00:00:00",
            "0000-01-01T00:00:00",
            "0002-01-01T00:00:00",
            "2000-01-01T00:00:00",
        ],
        dtype="datetime64[s]",
    )
    orig_values = np.array(
        [-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64"
    )
    units = "days since 0001-01-01 00:00:00"
    values, _, _ = xr.coding.times.encode_cf_datetime(
        dates, units, calendar, dtype=np.dtype("int64")
    )
    print(values)
    np.testing.assert_array_equal(values, orig_values)

    dates = np.array(
        [
            "-2000-01-01T01:00:00",
            "0000-01-01T00:00:00",
            "0002-01-01T00:00:00",
            "2000-01-01T00:00:00",
        ],
        dtype="datetime64[s]",
    )
    orig_values = np.array(
        [-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64"
    )
    units = "days since 0001-01-01 00:00:00"
    values, units, _ = xr.coding.times.encode_cf_datetime(
        dates, units, calendar, dtype=np.dtype("int64")
    )
    print(values, units)

.. _internals.default_timeunit:

Default Time Unit
~~~~~~~~~~~~~~~~~

The current default time unit of xarray is ``'ns'``. When setting the keyword argument ``time_unit`` to ``'s'`` (the lowest resolution pandas allows), datetimes will be converted to at least ``'s'`` resolution, if possible. The same holds true for ``'ms'`` and ``'us'``.

.. ipython:: python

    attrs = {"units": "hours since 2000-01-01"}
    ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)})
    ds.to_netcdf("test-datetimes1.nc")

.. ipython:: python

    xr.open_dataset("test-datetimes1.nc")

.. ipython:: python

    coder = xr.coders.CFDatetimeCoder(time_unit="s")
    xr.open_dataset("test-datetimes1.nc", decode_times=coder)

If a coarser unit is requested the datetimes are decoded into their native
on-disk resolution, if possible.

.. ipython:: python

    attrs = {"units": "milliseconds since 2000-01-01"}
    ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)})
    ds.to_netcdf("test-datetimes2.nc")

.. ipython:: python

    xr.open_dataset("test-datetimes2.nc")

.. ipython:: python

    coder = xr.coders.CFDatetimeCoder(time_unit="s")
    xr.open_dataset("test-datetimes2.nc", decode_times=coder)
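Timedelta decoding accepts the same ``time_unit`` keyword. A small sketch using the internal ``xr.coding.times.decode_cf_timedelta`` function changed in this PR (internal API, shown for illustration only):

.. ipython:: python

    # integer "days" decoded at the requested second resolution
    values = np.array([0, 1, 2], dtype="int64")
    xr.coding.times.decode_cf_timedelta(values, "days", time_unit="s")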
diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst
index 60ab1720ecf..986d43ce4b7 100644
--- a/doc/user-guide/io.rst
+++ b/doc/user-guide/io.rst
@@ -540,8 +540,8 @@ The ``units`` and ``calendar`` attributes control how xarray serializes ``datetime64`` and
 ``timedelta64`` arrays to datasets on disk as numeric values. The ``units`` encoding
 should be a string like ``'days since 1900-01-01'`` for ``datetime64`` data or a string
 like ``'days'`` for ``timedelta64`` data. ``calendar`` should be one of the calendar types
-supported by netCDF4-python: 'standard', 'gregorian', 'proleptic_gregorian' 'noleap',
-'365_day', '360_day', 'julian', 'all_leap', '366_day'.
+supported by netCDF4-python: ``'standard'``, ``'gregorian'``, ``'proleptic_gregorian'``, ``'noleap'``,
+``'365_day'``, ``'360_day'``, ``'julian'``, ``'all_leap'``, ``'366_day'``.

 By default, xarray uses the ``'proleptic_gregorian'`` calendar and units of the smallest time
 difference between values, with a reference time of the first time value.
diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst
index 8ec5dfea6c1..d131ae74b9f 100644
--- a/doc/user-guide/time-series.rst
+++ b/doc/user-guide/time-series.rst
@@ -21,9 +21,9 @@ core functionality.
 Creating datetime64 data
 ------------------------

-Xarray uses the numpy dtypes ``datetime64[ns]`` and ``timedelta64[ns]`` to
-represent datetime data, which offer vectorized (if sometimes buggy) operations
-with numpy and smooth integration with pandas.
+Xarray uses the numpy dtypes ``datetime64[unit]`` and ``timedelta64[unit]``
+(where unit is one of ``"s"``, ``"ms"``, ``"us"`` and ``"ns"``) to represent datetime
+data, which offer vectorized operations with numpy and smooth integration with pandas.

 To convert to or create regular arrays of ``datetime64`` data, we recommend
 using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`:
@@ -31,10 +31,30 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`:
 .. ipython:: python

     pd.to_datetime(["2000-01-01", "2000-02-02"])
+    pd.DatetimeIndex(
+        ["2000-01-01 00:00:00", "2000-02-02 00:00:00"], dtype="datetime64[s]"
+    )
     pd.date_range("2000-01-01", periods=365)
+    pd.date_range("2000-01-01", periods=365, unit="s")
+
+It is also possible to use the corresponding :py:func:`xarray.date_range`:
+
+.. ipython:: python
+
+    xr.date_range("2000-01-01", periods=365)
+    xr.date_range("2000-01-01", periods=365, unit="s")
+
+
+.. note::
+    Care has to be taken to create the output with the desired resolution.
+    For :py:func:`pandas.date_range` the ``unit`` kwarg has to be specified,
+    while for :py:func:`pandas.to_datetime` selecting the resolution is not
+    possible at all; for that, :py:class:`pandas.DatetimeIndex` can be used
+    directly. There is more in-depth information in the section
+    :ref:`internals.timecoding`.

 Alternatively, you can supply arrays of Python ``datetime`` objects. These get
-converted automatically when used as arguments in xarray objects:
+converted automatically when used as arguments in xarray objects (with ``us`` resolution):

 .. ipython:: python

@@ -51,12 +71,13 @@ attribute like ``'days since 2000-01-01'``).
 .. note::

     When decoding/encoding datetimes for non-standard calendars or for dates
-    before year 1678 or after year 2262, xarray uses the `cftime`_ library.
+    before `1582-10-15`_, xarray uses the `cftime`_ library by default.
     It was previously packaged with the ``netcdf4-python`` package under the
     name ``netcdftime`` but is now distributed separately. ``cftime`` is an
     :ref:`optional dependency` of xarray.

 .. _cftime: https://unidata.github.io/cftime
+.. _1582-10-15: https://en.wikipedia.org/wiki/Gregorian_calendar


 You can manual decode arrays in this form by passing a dataset to
 :py:func:`xarray.decode_cf`:

 .. ipython:: python

     attrs = {"units": "hours since 2000-01-01"}
     ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)})
+    # Default decoding to 'ns'-resolution
     xr.decode_cf(ds)
+    # Decoding to 's'-resolution
+    coder = xr.coders.CFDatetimeCoder(time_unit="s")
+    xr.decode_cf(ds, decode_times=coder)

-One unfortunate limitation of using ``datetime64[ns]`` is that it limits the
-native representation of dates to those that fall between the years 1678 and
-2262. When a netCDF file contains dates outside of these bounds, dates will be
-returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex`
-will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables a subset of
-the indexing functionality of a :py:class:`pandas.DatetimeIndex` and is only
-fully compatible with the standalone version of ``cftime`` (not the version
-packaged with earlier versions ``netCDF4``). See :ref:`CFTimeIndex` for more
-information.
+From xarray 2025.01.2 the resolution of the dates can be one of ``"s"``, ``"ms"``, ``"us"`` or ``"ns"``. One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262; this range increases significantly with coarser resolutions. When a store contains dates outside of these bounds (or dates < `1582-10-15`_ with a Gregorian, also known as standard, calendar), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing.
+:py:class:`~xarray.CFTimeIndex` enables most of the indexing functionality of a :py:class:`pandas.DatetimeIndex`.
+See :ref:`CFTimeIndex` for more information.

 Datetime indexing
 -----------------
diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst
index 5cc7b2e5af9..ac50c27d233 100644
--- a/doc/user-guide/weather-climate.rst
+++ b/doc/user-guide/weather-climate.rst
@@ -10,7 +10,7 @@ Weather and climate data

     import xarray as xr

-Xarray can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include :ref:`automatic labelling of plots` with descriptive names and units if proper metadata is present and support for non-standard calendars used in climate science through the ``cftime`` module(Explained in the :ref:`CFTimeIndex` section). There are also a number of :ref:`geosciences-focused projects that build on xarray`.
+Xarray can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include :ref:`automatic labelling of plots` with descriptive names and units if proper metadata is present and support for non-standard calendars used in climate science through the ``cftime`` module (explained in the :ref:`CFTimeIndex` section). There are also a number of :ref:`geosciences-focused projects that build on xarray`.

 .. _Climate and Forecast (CF) conventions: https://cfconventions.org

@@ -57,15 +57,14 @@ CF-compliant coordinate variables
 .. _CFTimeIndex:

-Non-standard calendars and dates outside the nanosecond-precision range
------------------------------------------------------------------------
+Non-standard calendars and dates outside the precision range
+------------------------------------------------------------

 Through the standalone ``cftime`` library and a custom subclass of
 :py:class:`pandas.Index`, xarray supports a subset of the indexing
 functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for
 dates from non-standard calendars commonly used in climate science or dates
-using a standard calendar, but outside the `nanosecond-precision range`_
-(approximately between years 1678 and 2262).
+using a standard calendar but outside the `precision range`_, as well as dates prior to `1582-10-15`_.

 .. note::

@@ -75,18 +74,14 @@ using a standard calendar, but outside the `nanosecond-precision range`_
    any of the following are true:

    - The dates are from a non-standard calendar
-   - Any dates are outside the nanosecond-precision range.
+   - Any dates are outside the nanosecond-precision range (prior to xarray version 2025.01.2)
+   - Any dates are outside the time span limited by the resolution (from xarray version 2025.01.2)

    Otherwise pandas-compatible dates from a standard calendar will be
-   represented with the ``np.datetime64[ns]`` data type, enabling the use of a
-   :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[ns]``
-   and their full set of associated features.
+   represented with the ``np.datetime64[unit]`` data type (where unit can be one of ``"s"``, ``"ms"``, ``"us"``, ``"ns"``), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features.

    As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime
-   values. For the time being, xarray still automatically casts datetime values
-   to nanosecond-precision for backwards compatibility with older pandas
-   versions; however, this is something we would like to relax going forward.
-   See :issue:`7493` for more discussion.
+   values. From xarray version 2025.01.2 on, non-nanosecond precision datetime values are also supported in xarray (this can be parameterized via :py:class:`~xarray.coders.CFDatetimeCoder` and the ``decode_times`` kwarg). See also :ref:`internals.timecoding`.

 For example, you can create a DataArray indexed by a time coordinate with
 dates from a no-leap calendar and a
@@ -115,7 +110,7 @@ instance, we can create the same dates and DataArray we created above using:

 Mirroring pandas' method with the same name, :py:meth:`~xarray.infer_freq` allows one to
 infer the sampling frequency of a :py:class:`~xarray.CFTimeIndex` or a 1-D
 :py:class:`~xarray.DataArray` containing cftime objects. It also works transparently with
-``np.datetime64[ns]`` and ``np.timedelta64[ns]`` data.
+``np.datetime64`` and ``np.timedelta64`` data (with "s", "ms", "us" or "ns" resolution).

 .. ipython:: python

@@ -137,7 +132,9 @@ Conversion between non-standard calendar and to/from pandas DatetimeIndexes is
 facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as
 :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime``
 argument controls which datetime backend is used in the output. The default (``None``) is to
-use ``pandas`` when possible, i.e. when the calendar is standard and dates are within 1678 and 2262.
+use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates start on or after `1582-10-15`_. There is no such restriction when converting to a ``proleptic_gregorian`` calendar.
+
+.. _1582-10-15: https://en.wikipedia.org/wiki/Gregorian_calendar

 .. ipython:: python

@@ -241,6 +238,6 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports:

     da.resample(time="81min", closed="right", label="right", offset="3min").mean()

-.. _nanosecond-precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
+.. _precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
 .. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601
 .. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index a19345b4ef6..95aa5a57438 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -19,9 +19,39 @@ What's New
 v2025.01.2 (unreleased)
 -----------------------

+This release brings non-nanosecond datetime resolution to xarray. In the
+last couple of releases xarray has been prepared for that change. The code had
+to be changed and adapted in numerous places, affecting especially the test suite.
+The documentation has been updated accordingly and a new internal chapter
+on :ref:`internals.timecoding` has been added.
+
+To make the transition as smooth as possible this is designed to be fully backwards
+compatible, keeping the current default of ``'ns'`` resolution on decoding.
+To opt in to decoding to other resolutions (``'us'``, ``'ms'`` or ``'s'``) the
+new :py:class:`coders.CFDatetimeCoder` is passed via the ``decode_times``
+kwarg (see also :ref:`internals.default_timeunit`):
+
+.. code-block:: python
+
+    coder = xr.coders.CFDatetimeCoder(time_unit="s")
+    ds = xr.open_dataset(filename, decode_times=coder)
+
+There might be slight changes when encoding/decoding times as some warning and
+error messages have been removed or rewritten. Xarray will now also allow
+non-nanosecond datetimes (with ``'us'``, ``'ms'`` or ``'s'`` resolution) when
+creating DataArrays from scratch, picking the lowest possible resolution:
+
+.. ipython:: python
+
+    xr.DataArray(data=[np.datetime64("2000-01-01", "D")], dims=("time",))
+
+In a future release the current default of ``'ns'`` resolution on decoding will
+be deprecated.
+
 New Features
 ~~~~~~~~~~~~
-
+- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`).
+  By `Kai Mühlbauer `_ and `Spencer Clark `_.

 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -37,7 +67,8 @@ Bug fixes

 Documentation
 ~~~~~~~~~~~~~
-
+- A chapter on :ref:`internals.timecoding` is added to the internal section (:pull:`9618`).
+  By `Kai Mühlbauer `_.

 Internal Changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 2adcc57c6b9..3211b9efbae 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -775,7 +775,8 @@ def open_dataarray(
         be replaced by NA. This keyword may not be supported by all the backends.
     decode_times : bool, CFDatetimeCoder or dict-like, optional
         If True, decode times encoded in the standard NetCDF datetime format
-        into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers.
+        into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or
+        leave them encoded as numbers.
         Pass a mapping, e.g.
``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -984,7 +985,8 @@ def open_datatree( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers. + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or + leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -1210,7 +1212,8 @@ def open_groups( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers. + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or + leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 50b048a8e29..f3ed6444904 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -64,7 +64,7 @@ from xarray.core.common import _contains_datetime_like_objects, is_np_datetime_like from xarray.core.pdcompat import ( count_not_none, - nanosecond_precision_timestamp, + default_precision_timestamp, ) from xarray.core.utils import attempt_import, emit_user_level_warning @@ -81,14 +81,6 @@ T_FreqStr = TypeVar("T_FreqStr", str, None) -def _nanosecond_precision_timestamp(*args, **kwargs): - # As of pandas version 3.0, pd.to_datetime(Timestamp(...)) will try to - # infer the appropriate datetime precision. Until xarray supports - # non-nanosecond precision times, we will use this constructor wrapper to - # explicitly create nanosecond-precision Timestamp objects. - return pd.Timestamp(*args, **kwargs).as_unit("ns") - - def get_date_type(calendar, use_cftime=True): """Return the cftime date type for a given calendar name.""" if TYPE_CHECKING: @@ -97,7 +89,7 @@ def get_date_type(calendar, use_cftime=True): cftime = attempt_import("cftime") if _is_standard_calendar(calendar) and not use_cftime: - return _nanosecond_precision_timestamp + return default_precision_timestamp calendars = { "noleap": cftime.DatetimeNoLeap, @@ -1427,10 +1419,8 @@ def date_range_like(source, calendar, use_cftime=None): if is_np_datetime_like(source.dtype): # We want to use datetime fields (datetime64 object don't have them) source_calendar = "standard" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. 
- source_start = nanosecond_precision_timestamp(source_start) - source_end = nanosecond_precision_timestamp(source_end) + source_start = default_precision_timestamp(source_start) + source_end = default_precision_timestamp(source_end) else: if isinstance(source, CFTimeIndex): source_calendar = source.calendar diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 596a51a0dcf..bd5f51551c7 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -581,13 +581,14 @@ def to_datetimeindex(self, unsafe=False): CFTimeIndex([2000-01-01 00:00:00, 2000-01-02 00:00:00], dtype='object', length=2, calendar='standard', freq=None) >>> times.to_datetimeindex() - DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[us]', freq=None) """ if not self._data.size: return pd.DatetimeIndex([]) - nptimes = cftime_to_nptime(self) + # transforming to us-resolution is needed for DatetimeIndex + nptimes = cftime_to_nptime(self, time_unit="us") calendar = infer_calendar_name(self) if calendar not in _STANDARD_CALENDARS and not unsafe: warnings.warn( diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 6d758ee8d87..adbec3b9063 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -5,7 +5,7 @@ from collections.abc import Callable, Hashable from datetime import datetime, timedelta from functools import partial -from typing import TYPE_CHECKING, Literal, Union, cast +from typing import TYPE_CHECKING, Union, cast import numpy as np import pandas as pd @@ -24,7 +24,7 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item -from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit +from xarray.core.pdcompat import default_precision_timestamp, timestamp_as_unit from xarray.core.utils import attempt_import, emit_user_level_warning from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type @@ -38,7 +38,9 @@ from xarray.core.types import ( CFCalendar, + CFTimeDatetime, NPDatetimeUnitOptions, + PDDatetimeUnitOptions, T_DuckArray, ) @@ -102,6 +104,13 @@ def _is_numpy_compatible_time_range(times): tmin = times.min() tmax = times.max() try: + # before relaxing the nanosecond constraint + # this raised OutOfBoundsDatetime for + # times < 1678 and times > 2262 + # this isn't the case anymore for other resolutions like "s" + # now, we raise for dates before 1582-10-15 + _check_date_is_after_shift(tmin, "standard") + _check_date_is_after_shift(tmax, "standard") convert_time_or_go_back(tmin, pd.Timestamp) convert_time_or_go_back(tmax, pd.Timestamp) except pd.errors.OutOfBoundsDatetime: @@ -278,9 +287,7 @@ def _unpack_time_unit_and_ref_date( # processing in encode_cf_datetime time_unit, _ref_date = _unpack_netcdf_time_units(units) time_unit = _netcdf_to_numpy_timeunit(time_unit) - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - ref_date = nanosecond_precision_timestamp(_ref_date) + ref_date = pd.Timestamp(_ref_date)
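+ # pd.Timestamp keeps the resolution of the reference-date string ('s', 'ms', ...), see the Time Coding internals page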
ref_date = _maybe_strip_tz_from_timestamp(ref_date) return time_unit, ref_date @@ -290,6 +297,7 @@ def _decode_cf_datetime_dtype( units: str, calendar: str | None, use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions = "ns", ) -> np.dtype: # Verify that at least the first and last date can be decoded # successfully. Otherwise, tracebacks end up swallowed by ) try: - result = decode_cf_datetime(example_value, units, calendar, use_cftime) + result = decode_cf_datetime( + example_value, units, calendar, use_cftime, time_unit + ) except Exception as err: calendar_msg = ( "the default calendar" if calendar is None else f"calendar {calendar!r}" ) @@ -333,7 +343,7 @@ def _decode_datetime_with_cftime def _check_date_for_units_since_refdate( - date, unit: str, ref_date: pd.Timestamp + date, unit: NPDatetimeUnitOptions, ref_date: pd.Timestamp ) -> pd.Timestamp: # check for out-of-bounds floats and raise if date > np.iinfo("int64").max or date < np.iinfo("int64").min: @@ -356,8 +366,71 @@ return pd.Timestamp("NaT") +def _check_timedelta_range(value, data_unit, time_unit): + if value > np.iinfo("int64").max or value < np.iinfo("int64").min: + raise OutOfBoundsTimedelta(f"Value {value} can't be represented as Timedelta.") + # on windows multiplying nan leads to RuntimeWarning + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "invalid value encountered in multiply", RuntimeWarning + ) + delta = value * np.timedelta64(1, data_unit) + if not np.isnan(delta): + # this will raise on dtype overflow for unsigned integer dtypes + if value.dtype.kind in "u" and not np.int64(delta) == value: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." + ) + # this will raise on overflow if delta cannot be represented with the + # resolutions supported by pandas.
+ pd.to_timedelta(delta) + + +def _align_reference_date_and_unit( + ref_date: pd.Timestamp, unit: NPDatetimeUnitOptions +) -> pd.Timestamp: + # align to the highest needed resolution of ref_date or unit + if np.timedelta64(1, ref_date.unit) > np.timedelta64(1, unit): + # this will raise accordingly + # if data can't be represented in the higher resolution + return timestamp_as_unit(ref_date, cast(PDDatetimeUnitOptions, unit)) + return ref_date + + +def _check_date_is_after_shift( + date: pd.Timestamp | datetime | CFTimeDatetime, calendar: str +) -> None: + # if we have gregorian/standard we need to raise + # if we are outside the well-defined date range + # proleptic_gregorian and standard/gregorian are only equivalent + # if reference date and date range is >= 1582-10-15 + if calendar != "proleptic_gregorian": + if date < type(date)(1582, 10, 15): + raise OutOfBoundsDatetime( + f"Dates before 1582-10-15 cannot be decoded " + f"with pandas using {calendar!r} calendar: {date}" + ) + + +def _check_higher_resolution( + flat_num_dates: np.ndarray, + time_unit: PDDatetimeUnitOptions, +) -> tuple[np.ndarray, PDDatetimeUnitOptions]: + """Iterate until fitting resolution found.""" + res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] + new_units = res[res.index(time_unit) :] + for new_time_unit in new_units: + if not ((np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns"): + break + flat_num_dates *= 1000 + return flat_num_dates, new_time_unit + + def _decode_datetime_with_pandas( - flat_num_dates: np.ndarray, units: str, calendar: str + flat_num_dates: np.ndarray, + units: str, + calendar: str, + time_resolution: PDDatetimeUnitOptions = "ns", ) -> np.ndarray: if not _is_standard_calendar(calendar): raise OutOfBoundsDatetime( @@ -376,11 +449,16 @@ def _decode_datetime_with_pandas( try: time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + ref_date = _align_reference_date_and_unit(ref_date, time_unit) + # here the highest wanted resolution is set + ref_date = _align_reference_date_and_unit(ref_date, time_resolution) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err + _check_date_is_after_shift(ref_date, calendar) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: @@ -403,24 +481,12 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind in "f": flat_num_dates = flat_num_dates.astype(np.float64) - # keep NaT/nan mask - nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min) - # in case we need to change the unit, we fix the numbers here - # this should be safe, as errors would have been raised above - ns_time_unit = _NS_PER_TIME_DELTA[time_unit] - ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit] - if ns_time_unit > ns_ref_date_unit: - flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) - time_unit = ref_date.unit - - # Cast input ordinals to integers and properly handle NaN/NaT - # to prevent casting NaN to int - flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) - flat_num_dates_int[nan] = np.iinfo(np.int64).min - flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) + timedeltas = _numbers_to_timedelta( + flat_num_dates, time_unit, ref_date.unit, "datetime" + ) - # cast to timedelta64[time_unit] and add to ref_date - return ref_date + 
flat_num_dates_int.astype(f"timedelta64[{time_unit}]") + # add timedeltas to ref_date + return ref_date + timedeltas def decode_cf_datetime( @@ -428,6 +494,7 @@ def decode_cf_datetime( units: str, calendar: str | None = None, use_cftime: bool | None = None, + time_unit: PDDatetimeUnitOptions = "ns", ) -> np.ndarray: """Given an array of numeric dates in netCDF format, convert it into a numpy array of date time objects. @@ -450,59 +517,134 @@ def decode_cf_datetime( if use_cftime is None: try: - dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) + dates = _decode_datetime_with_pandas( + flat_num_dates, units, calendar, time_unit + ) except (KeyError, OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError): dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) # retrieve cftype dates_min = dates[np.nanargmin(num_dates)] + dates_max = dates[np.nanargmax(num_dates)] cftype = type(dates_min) + # create first day of gregorian calendar in current cf calendar type + border = cftype(1582, 10, 15) # "ns" borders # between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807'] lower = cftype(1677, 9, 21, 0, 12, 43, 145224) upper = cftype(2262, 4, 11, 23, 47, 16, 854775) - if dates_min < lower or dates[np.nanargmax(num_dates)] > upper: + if dates_min < border: if _is_standard_calendar(calendar): - warnings.warn( + emit_user_level_warning( "Unable to decode time axis into full " "numpy.datetime64 objects, continuing using " - "cftime.datetime objects instead, reason: dates out " - "of range", + "cftime.datetime objects instead, reason: dates prior " + "reform date (1582-10-15). To silence this warning specify " + "'use_cftime=True'.", SerializationWarning, - stacklevel=3, ) + elif time_unit == "ns" and (dates_min < lower or dates_max > upper): + emit_user_level_warning( + "Unable to decode time axis into full " + "numpy.datetime64[ns] objects, continuing using " + "cftime.datetime objects instead, reason: dates out " + "of range. 
To silence this warning use a coarser resolution " + "'time_unit' or specify 'use_cftime=True'.", + SerializationWarning, + ) else: if _is_standard_calendar(calendar): - dates = cftime_to_nptime(dates) + dates = cftime_to_nptime(dates, time_unit=time_unit) elif use_cftime: dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar) else: - dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) + dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar, time_unit) return reshape(dates, num_dates.shape) -def to_timedelta_unboxed(value, **kwargs): - result = pd.to_timedelta(value, **kwargs).to_numpy() - assert result.dtype == "timedelta64[ns]" - return result - - def to_datetime_unboxed(value, **kwargs): result = pd.to_datetime(value, **kwargs).to_numpy() - assert result.dtype == "datetime64[ns]" + assert np.issubdtype(result.dtype, "datetime64") return result -def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: +def _numbers_to_timedelta( + flat_num: np.ndarray, + time_unit: NPDatetimeUnitOptions, + ref_unit: PDDatetimeUnitOptions, + datatype: str, +) -> np.ndarray: + """Transform numbers to np.timedelta64.""" + # keep NaT/nan mask + nan = np.isnan(flat_num) | (flat_num == np.iinfo(np.int64).min) + + # in case we need to change the unit, we fix the numbers here + # this should be safe, as errors would have been raised above + ns_time_unit = _NS_PER_TIME_DELTA[time_unit] + ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_unit] + if ns_time_unit > ns_ref_date_unit: + flat_num *= np.int64(ns_time_unit / ns_ref_date_unit) + time_unit = ref_unit + + # estimate fitting resolution for floating point values + # this iterates until all floats are fractionless or time_unit == "ns" + if flat_num.dtype.kind == "f" and time_unit != "ns": + flat_num_dates, new_time_unit = _check_higher_resolution(flat_num, time_unit) # type: ignore[arg-type] + if time_unit != new_time_unit: + msg = ( + f"Can't decode floating point {datatype} to {time_unit!r} without " + f"precision loss, decoding to {new_time_unit!r} instead. " + f"To silence this warning use time_unit={new_time_unit!r} in call to " + f"decoding function." + ) + emit_user_level_warning(msg, SerializationWarning) + time_unit = new_time_unit + + # Cast input ordinals to integers and properly handle NaN/NaT + # to prevent casting NaN to int + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + flat_num = flat_num.astype(np.int64) + flat_num[nan] = np.iinfo(np.int64).min + + # cast to wanted type + return flat_num.astype(f"timedelta64[{time_unit}]") + + +def decode_cf_timedelta( + num_timedeltas, units: str, time_unit: PDDatetimeUnitOptions = "ns" +) -> np.ndarray: """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64[ns] array. + numpy timedelta64 ["s", "ms", "us", "ns"] array. 
""" num_timedeltas = np.asarray(num_timedeltas) - units = _netcdf_to_numpy_timeunit(units) - result = to_timedelta_unboxed(ravel(num_timedeltas), unit=units) + unit = _netcdf_to_numpy_timeunit(units) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "All-NaN slice encountered", RuntimeWarning) + _check_timedelta_range(np.nanmin(num_timedeltas), unit, time_unit) + _check_timedelta_range(np.nanmax(num_timedeltas), unit, time_unit) + + timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta") + pd_timedeltas = pd.to_timedelta(ravel(timedeltas)) + + if np.isnat(timedeltas).all(): + empirical_unit = time_unit + else: + empirical_unit = pd_timedeltas.unit + + if np.timedelta64(1, time_unit) > np.timedelta64(1, empirical_unit): + time_unit = empirical_unit + + if time_unit not in {"s", "ms", "us", "ns"}: + raise ValueError( + f"time_unit must be one of 's', 'ms', 'us', or 'ns'. Got: {time_unit}" + ) + + result = pd_timedeltas.as_unit(time_unit).to_numpy() return reshape(result, num_timedeltas.shape) @@ -512,10 +654,11 @@ def _unit_timedelta_cftime(units: str) -> timedelta: def _unit_timedelta_numpy(units: str) -> np.timedelta64: numpy_units = _netcdf_to_numpy_timeunit(units) - return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns") + return np.timedelta64(1, numpy_units) def _infer_time_units_from_diff(unique_timedeltas) -> str: + # todo: check, if this function works correctly wrt np.timedelta64 unit_timedelta: Callable[[str], timedelta] | Callable[[str], np.timedelta64] zero_timedelta: timedelta | np.timedelta64 if unique_timedeltas.dtype == np.dtype("O"): @@ -532,10 +675,6 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str: return "seconds" -def _time_units_to_timedelta64(units: str) -> np.timedelta64: - return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]") - - def infer_calendar_name(dates) -> CFCalendar: """Given an array of datetimes, infer the CF calendar name""" if is_np_datetime_like(dates.dtype): @@ -562,13 +701,11 @@ def infer_datetime_units(dates) -> str: unique time deltas in `dates`) """ dates = ravel(np.asarray(dates)) - if np.asarray(dates).dtype == "datetime64[ns]": + if np.issubdtype(np.asarray(dates).dtype, "datetime64"): dates = to_datetime_unboxed(dates) dates = dates[pd.notnull(dates)] reference_date = dates[0] if len(dates) > 0 else "1970-01-01" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - reference_date = nanosecond_precision_timestamp(reference_date) + reference_date = pd.Timestamp(reference_date) else: reference_date = dates[0] if len(dates) > 0 else "1970-01-01" reference_date = format_cftime_datetime(reference_date) @@ -589,30 +726,28 @@ def infer_timedelta_units(deltas) -> str: {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly divide all unique time deltas in `deltas`) """ - deltas = to_timedelta_unboxed(ravel(np.asarray(deltas))) + deltas = ravel(deltas) unique_timedeltas = np.unique(deltas[pd.notnull(deltas)]) return _infer_time_units_from_diff(unique_timedeltas) -def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: +def cftime_to_nptime( + times, raise_on_invalid: bool = True, time_unit: PDDatetimeUnitOptions = "ns" +) -> np.ndarray: """Given an array of cftime.datetime objects, return an array of numpy.datetime64 objects of the same size If raise_on_invalid is True (default), invalid dates trigger a ValueError. 
Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) - # TODO: the strict enforcement of nanosecond precision datetime values can - # be relaxed when addressing GitHub issue #7493. - new = np.empty(times.shape, dtype="M8[ns]") - dt: pd.Timestamp | Literal["NaT"] - for i, t in np.ndenumerate(times): + new = [] + dt: np.datetime64 + for _i, t in np.ndenumerate(times): try: - # Use pandas.Timestamp in place of datetime.datetime, because - # NumPy casts it safely it np.datetime64[ns] for dates outside - # 1678 to 2262 (this is not currently the case for - # datetime.datetime). - dt = nanosecond_precision_timestamp( - t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond + # We expect either "us" resolution or "s" resolution depending on + # whether 'microseconds' are defined for the input or not. + dt = ( + pd.Timestamp(np.datetime64(t.isoformat())).as_unit(time_unit).to_numpy() ) except ValueError as e: if raise_on_invalid: @@ -621,9 +756,9 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: f"standard calendar. Reason: {e}." ) from e else: - dt = "NaT" - new[i] = np.datetime64(dt) - return new + dt = np.datetime64("NaT") + new.append(dt) + return np.asarray(new).reshape(times.shape) def convert_times(times, date_type, raise_on_invalid: bool = True) -> np.ndarray: @@ -668,10 +803,8 @@ def convert_time_or_go_back(date, date_type): This is meant to convert end-of-month dates into a new calendar. """ - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. if date_type == pd.Timestamp: - date_type = nanosecond_precision_timestamp + date_type = default_precision_timestamp try: return date_type( date.year, @@ -765,6 +898,22 @@ def _encode_datetime_with_cftime(dates, units: str, calendar: str) -> np.ndarray # numpy's broken datetime conversion only works for us precision dates = dates.astype("M8[us]").astype(datetime) + def wrap_dt(dt): + # convert to cftime proleptic gregorian in case of datetime.datetime + # needed because of https://github.com/Unidata/cftime/issues/354 + if isinstance(dt, datetime) and not isinstance(dt, cftime.datetime): + dt = cftime.datetime( + dt.year, + dt.month, + dt.day, + dt.hour, + dt.minute, + dt.second, + dt.microsecond, + calendar="proleptic_gregorian", + ) + return dt + def encode_datetime(d): # Since netCDF files do not support storing float128 values, we ensure # that float64 values are used by setting longdouble=False in num2date. 
@@ -774,10 +923,10 @@ def encode_datetime(d): return ( np.nan if d is None - else cftime.date2num(d, units, calendar, longdouble=False) + else cftime.date2num(wrap_dt(d), units, calendar, longdouble=False) ) except TypeError: - return np.nan if d is None else cftime.date2num(d, units, calendar) + return np.nan if d is None else cftime.date2num(wrap_dt(d), units, calendar) return reshape(np.array([encode_datetime(d) for d in ravel(dates)]), dates.shape) @@ -866,9 +1015,7 @@ def _eagerly_encode_cf_datetime( allow_units_modification: bool = True, ) -> tuple[T_DuckArray, str, str]: dates = asarray(dates) - data_units = infer_datetime_units(dates) - if units is None: units = data_units else: @@ -881,14 +1028,30 @@ if not _is_standard_calendar(calendar) or dates.dtype.kind == "O": # parse with cftime instead raise OutOfBoundsDatetime - assert dates.dtype == "datetime64[ns]" + assert np.issubdtype(dates.dtype, "datetime64") + if calendar in ["standard", "gregorian"] and np.nanmin(dates).astype( + "=M8[us]" + ).astype(datetime) < datetime(1582, 10, 15): + # if we use a standard calendar and dates are before the reform + # we need to use cftime instead + emit_user_level_warning( + f"Unable to encode numpy.datetime64 objects with {calendar} calendar. " + "Using cftime.datetime objects instead, reason: dates prior " + "to the reform date (1582-10-15). To silence this warning, transform " + "numpy.datetime64 to corresponding cftime.datetime beforehand.", + SerializationWarning, + ) + raise OutOfBoundsDatetime time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + # calendar equivalence only for days after the reform + _check_date_is_after_shift(ref_date, calendar) time_delta = np.timedelta64(1, time_unit) # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from # dates to be encoded (GH 2272).
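+ # (subtracting a pd.Timestamp from a pd.DatetimeIndex yields a pd.TimedeltaIndex, see step 8 of the encoding docs)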
+ # DatetimeIndex will convert to units of ["s", "ms", "us", "ns"] dates_as_index = pd.DatetimeIndex(ravel(dates)) time_deltas = dates_as_index - ref_date @@ -941,6 +1104,7 @@ def _eagerly_encode_cf_datetime( num = cast_to_int_if_safe(num) if dtype is not None: + # todo: check, if this is really needed for all dtypes num = _cast_to_dtype_if_safe(num, dtype) return num, units, calendar @@ -1015,12 +1179,14 @@ def _eagerly_encode_cf_timedelta( allow_units_modification: bool = True, ) -> tuple[T_DuckArray, str]: data_units = infer_timedelta_units(timedeltas) - if units is None: units = data_units - time_delta = _time_units_to_timedelta64(units) + time_delta = _unit_timedelta_numpy(units) time_deltas = pd.TimedeltaIndex(ravel(timedeltas)) + # get resolution of TimedeltaIndex and align time_delta + deltas_unit = time_deltas.unit + time_delta = time_delta.astype(f"=m8[{deltas_unit}]") # retrieve needed units to faithfully encode to int64 needed_units = data_units @@ -1028,7 +1194,7 @@ def _eagerly_encode_cf_timedelta( needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna())) # needed time delta to encode faithfully to int64 - needed_time_delta = _time_units_to_timedelta64(needed_units) + needed_time_delta = _unit_timedelta_numpy(needed_units) floor_division = np.issubdtype(dtype, np.integer) or dtype is None if time_delta > needed_time_delta: @@ -1049,12 +1215,14 @@ def _eagerly_encode_cf_timedelta( ) units = needed_units time_delta = needed_time_delta + time_delta = time_delta.astype(f"=m8[{deltas_unit}]") floor_division = True num = _division(time_deltas, time_delta, floor_division) num = reshape(num.values, timedeltas.shape) if dtype is not None: + # todo: check, if this is needed for all dtypes num = _cast_to_dtype_if_safe(num, dtype) return num, units @@ -1099,11 +1267,32 @@ def _lazily_encode_cf_timedelta( class CFDatetimeCoder(VariableCoder): + """Coder for CF Datetime coding. + + Parameters + ---------- + use_cftime : bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64`` objects. If False, always + decode times to ``np.datetime64`` objects; if this is not possible, + raise an error. + May not be supported by all backends. + time_unit : PDDatetimeUnitOptions + Target resolution when decoding dates. Defaults to "ns".
+ """ + def __init__( self, use_cftime: bool | None = None, + time_unit: PDDatetimeUnitOptions = "ns", ) -> None: self.use_cftime = use_cftime + self.time_unit = time_unit def encode(self, variable: Variable, name: T_Name = None) -> Variable: if np.issubdtype( @@ -1130,12 +1319,15 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = pop_to(attrs, encoding, "units") calendar = pop_to(attrs, encoding, "calendar") - dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime) + dtype = _decode_cf_datetime_dtype( + data, units, calendar, self.use_cftime, self.time_unit + ) transform = partial( decode_cf_datetime, units=units, calendar=calendar, use_cftime=self.use_cftime, + time_unit=self.time_unit, ) data = lazy_elemwise_func(data, transform, dtype) @@ -1165,6 +1357,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = pop_to(attrs, encoding, "units") transform = partial(decode_cf_timedelta, units=units) + # todo: check, if we can relax this one here, too dtype = np.dtype("timedelta64[ns]") data = lazy_elemwise_func(data, transform, dtype=dtype) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8154f044332..83112628dbb 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -46,7 +46,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: raise NotImplementedError() def decode(self, variable: Variable, name: T_Name = None) -> Variable: - """Convert an decoded variable to a encoded variable""" + """Convert a decoded variable to an encoded variable""" raise NotImplementedError() diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f185a05c2b9..51fc4a00421 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1753,8 +1753,10 @@ def _convert_scalar(self, item): # pd.Timestamp rather np.than datetime64 but this is easier # (for now) item = np.datetime64("NaT", "ns") + elif isinstance(item, pd.Timedelta): + item = item.to_numpy() elif isinstance(item, timedelta): - item = np.timedelta64(getattr(item, "value", item), "ns") + item = np.timedelta64(item) elif isinstance(item, pd.Timestamp): # Work around for GH: pydata/xarray#1932 and numpy/numpy#10668 # numpy fails to convert pd.Timestamp to np.datetime64[ns] diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 4c54715e2a0..e7a36574fcd 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -39,7 +39,6 @@ from typing import Literal import pandas as pd -from packaging.version import Version from xarray.core.types import PDDatetimeUnitOptions @@ -89,13 +88,12 @@ def timestamp_as_unit(date: pd.Timestamp, unit: PDDatetimeUnitOptions) -> pd.Tim return date -def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp: - """Return a nanosecond-precision Timestamp object. +def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: + """Return a Timestamp object with the default precision. - Note this function should no longer be needed after addressing GitHub issue - #7493. + Xarray default is "ns". 
""" - if Version(pd.__version__) >= Version("2.0.0"): - return pd.Timestamp(*args, **kwargs).as_unit("ns") - else: - return pd.Timestamp(*args, **kwargs) + dt = pd.Timestamp(*args, **kwargs) + if dt.unit != "ns": + dt = timestamp_as_unit(dt, "ns") + return dt diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9d56555f31b..088c5f405ef 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -78,16 +78,6 @@ from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint -NON_NANOSECOND_WARNING = ( - "Converting non-nanosecond precision {case} values to nanosecond precision. " - "This behavior can eventually be relaxed in xarray, as it is an artifact from " - "pandas which is now beginning to support non-nanosecond precision values. " - "This warning is caused by passing non-nanosecond np.datetime64 or " - "np.timedelta64 values to the DataArray or Variable constructor; it can be " - "silenced by converting the values to nanosecond precision ahead of time." -) - - class MissingDimensionsError(ValueError): """Error class used when we can't safely guess a dimension name.""" @@ -205,51 +195,16 @@ def _maybe_wrap_data(data): return data -def _as_nanosecond_precision(data): - dtype = data.dtype - non_ns_datetime64 = ( - dtype.kind == "M" - and isinstance(dtype, np.dtype) - and dtype != np.dtype("datetime64[ns]") - ) - non_ns_datetime_tz_dtype = ( - isinstance(dtype, pd.DatetimeTZDtype) and dtype.unit != "ns" - ) - if non_ns_datetime64 or non_ns_datetime_tz_dtype: - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="datetime")) - if isinstance(dtype, pd.DatetimeTZDtype): - nanosecond_precision_dtype = pd.DatetimeTZDtype("ns", dtype.tz) - else: - nanosecond_precision_dtype = "datetime64[ns]" - return duck_array_ops.astype(data, nanosecond_precision_dtype) - elif dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"): - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="timedelta")) - return duck_array_ops.astype(data, "timedelta64[ns]") - else: - return data - - def _possibly_convert_objects(values): - """Convert arrays of datetime.datetime and datetime.timedelta objects into - datetime64 and timedelta64, according to the pandas convention. + """Convert object arrays into datetime64 and timedelta64 according + to the pandas convention. * datetime.datetime * datetime.timedelta * pd.Timestamp * pd.Timedelta - - For the time being, convert any non-nanosecond precision DatetimeIndex or - TimedeltaIndex objects to nanosecond precision. While pandas is relaxing this - in version 2.0.0, in xarray we will need to make sure we are ready to handle - non-nanosecond precision datetimes or timedeltas in our code before allowing - such values to pass through unchanged. Converting to nanosecond precision - through pandas.Series objects ensures that datetimes and timedeltas are - within the valid date range for ns precision, as pandas will raise an error - if they are not. """ as_series = pd.Series(values.ravel(), copy=False) - if as_series.dtype.kind in "mM": - as_series = _as_nanosecond_precision(as_series) result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default @@ -260,28 +215,13 @@ def _possibly_convert_objects(values): return result -def _possibly_convert_datetime_or_timedelta_index(data): - """For the time being, convert any non-nanosecond precision DatetimeIndex or - TimedeltaIndex objects to nanosecond precision. 
While pandas is relaxing - this in version 2.0.0, in xarray we will need to make sure we are ready to - handle non-nanosecond precision datetimes or timedeltas in our code - before allowing such values to pass through unchanged.""" - if isinstance(data, PandasIndexingAdapter): - if isinstance(data.array, pd.DatetimeIndex | pd.TimedeltaIndex): - data = PandasIndexingAdapter(_as_nanosecond_precision(data.array)) - elif isinstance(data, pd.DatetimeIndex | pd.TimedeltaIndex): - data = _as_nanosecond_precision(data) - return data - - def as_compatible_data( data: T_DuckArray | ArrayLike, fastpath: bool = False ) -> T_DuckArray: """Prepare and wrap data to put in a Variable. - If data does not have the necessary attributes, convert it to ndarray. - - If data has dtype=datetime64, ensure that it has ns precision. If it's a - pandas.Timestamp, convert it to datetime64. + - If it's a pandas.Timestamp, convert it to datetime64. - If data is already a pandas or xarray object (other than an Index), just use the values. @@ -301,7 +241,6 @@ def as_compatible_data( return cast("T_DuckArray", data._variable._data) def convert_non_numpy_type(data): - data = _possibly_convert_datetime_or_timedelta_index(data) return cast("T_DuckArray", _maybe_wrap_data(data)) if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES): @@ -361,10 +300,13 @@ def _as_array_or_item(data): """ data = np.asarray(data) if data.ndim == 0: - if data.dtype.kind == "M": - data = np.datetime64(data, "ns") - elif data.dtype.kind == "m": - data = np.timedelta64(data, "ns") + kind = data.dtype.kind + if kind in "mM": + unit, _ = np.datetime_data(data.dtype) + if kind == "M": + data = np.datetime64(data, unit) + elif kind == "m": + data = np.timedelta64(data, unit) return data diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 1f2eedcd8f0..48a5e8c4b66 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -318,7 +318,14 @@ def create_test_data( f'Not enough letters for filling this dimension size ({_dims["dim3"]})' ) obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]])) - obj["time"] = ("time", pd.date_range("2000-01-01", periods=20)) + obj["time"] = ( + "time", + pd.date_range( + "2000-01-01", + periods=20, + unit="ns", + ), + ) for v, dims in sorted(_vars.items()): data = rs.normal(size=tuple(_dims[d] for d in dims)) obj[v] = (dims, data) diff --git a/xarray/tests/conftest.py b/xarray/tests/conftest.py index 97de58c4af2..c3f1ccbfe3c 100644 --- a/xarray/tests/conftest.py +++ b/xarray/tests/conftest.py @@ -220,3 +220,8 @@ def simple_datatree(create_test_datatree): Returns a DataTree. 
""" return create_test_datatree() + + +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def time_unit(request): + return request.param diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cfca5e69048..72078da11b9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -613,6 +613,12 @@ def test_roundtrip_cftime_datetime_data(self) -> None: warnings.filterwarnings("ignore", "Unable to decode time axis") with self.roundtrip(expected, save_kwargs=kwargs) as actual: + # proleptic gregorian will be decoded into numpy datetime64 + # fixing to expectations + if actual.t.dtype.kind == "M": + dtype = actual.t.dtype + expected_decoded_t = expected_decoded_t.astype(dtype) + expected_decoded_t0 = expected_decoded_t0.astype(dtype) abs_diff = abs(actual.t.values - expected_decoded_t) assert (abs_diff <= np.timedelta64(1, "s")).all() assert ( @@ -627,7 +633,11 @@ def test_roundtrip_cftime_datetime_data(self) -> None: assert actual.t.encoding["calendar"] == expected_calendar def test_roundtrip_timedelta_data(self) -> None: - time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]) # type: ignore[arg-type, unused-ignore] + # todo: suggestion from review: + # roundtrip large microsecond or coarser resolution timedeltas, + # though we cannot test that until we fix the timedelta decoding + # to support large ranges + time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore] expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]}) with self.roundtrip(expected) as actual: assert_identical(expected, actual) @@ -1623,8 +1633,7 @@ def test_open_encodings(self) -> None: ds.variables["time"][:] = np.arange(10) + 4 expected = Dataset() - - time = pd.date_range("1999-01-05", periods=10) + time = pd.date_range("1999-01-05", periods=10, unit="ns") encoding = {"units": units, "dtype": np.dtype("int32")} expected["time"] = ("time", time, {}, encoding) @@ -5613,16 +5622,14 @@ def test_use_cftime_standard_calendar_default_in_range(calendar) -> None: @requires_cftime @requires_scipy_or_netCDF4 -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) -def test_use_cftime_standard_calendar_default_out_of_range( - calendar, units_year -) -> None: +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +def test_use_cftime_standard_calendar_default_out_of_range(calendar) -> None: + # todo: check, if we still need to test for two dates import cftime x = [0, 1] time = [0, 720] - units = f"days since {units_year}-01-01" + units = "days since 1582-01-01" original = DataArray(x, [("time", time)], name="x").to_dataset() for v in ["x", "time"]: original[v].attrs["units"] = units @@ -5700,19 +5707,19 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with warnings.catch_warnings(record=True) as record: - with open_dataset(tmp_file, use_cftime=False) as ds: + coder = xr.coders.CFDatetimeCoder(use_cftime=False) + with open_dataset(tmp_file, decode_times=coder) as ds: assert_identical(expected_x, ds.x) assert_identical(expected_time, ds.time) _assert_no_dates_out_of_range_warning(record) @requires_scipy_or_netCDF4 -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) -def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None: +@pytest.mark.parametrize("calendar", ["standard", 
"gregorian"]) +def test_use_cftime_false_standard_calendar_out_of_range(calendar) -> None: x = [0, 1] time = [0, 720] - units = f"days since {units_year}-01-01" + units = "days since 1582-01-01" original = DataArray(x, [("time", time)], name="x").to_dataset() for v in ["x", "time"]: original[v].attrs["units"] = units @@ -5814,7 +5821,9 @@ def test_open_fsspec() -> None: mm = m.get_mapper("out1.zarr") ds.to_zarr(mm) # old interface ds0 = ds.copy() - ds0["time"] = ds.time + pd.to_timedelta("1 day") + # pd.to_timedelta returns ns-precision, but the example data is in second precision + # so we need to fix this + ds0["time"] = ds.time + np.timedelta64(1, "D") mm = m.get_mapper("out2.zarr") ds0.to_zarr(mm) # old interface diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 1ab6c611aac..4db73048548 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -1480,7 +1480,7 @@ def test_date_range_like_same_calendar(): assert src is out -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_date_range_like_errors(): src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False) src = src[np.arange(20) != 10] # Remove 1 day so the frequency is not inferable. diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 2f527bf298e..8fc79a0cc53 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1221,7 +1221,7 @@ def test_strftime_of_cftime_array(calendar): @pytest.mark.parametrize("unsafe", [False, True]) def test_to_datetimeindex(calendar, unsafe): index = xr.cftime_range("2000", periods=5, calendar=calendar) - expected = pd.date_range("2000", periods=5) + expected = pd.date_range("2000", periods=5, unit="us") if calendar in _NON_STANDARD_CALENDARS and not unsafe: with pytest.warns(RuntimeWarning, match="non-standard"): @@ -1238,7 +1238,15 @@ def test_to_datetimeindex(calendar, unsafe): @pytest.mark.parametrize("calendar", _ALL_CALENDARS) def test_to_datetimeindex_out_of_range(calendar): index = xr.cftime_range("0001", periods=5, calendar=calendar) - with pytest.raises(ValueError, match="0001"): + # todo: suggestion from code review: + # - still warn when converting from a non-standard calendar + # to a proleptic Gregorian calendar + # - also warn when converting from a Gregorian calendar + # to a proleptic Gregorian calendar when dates fall before the reform + if calendar in _NON_STANDARD_CALENDARS: + with pytest.warns(RuntimeWarning, match="non-standard"): + index.to_datetimeindex() + else: index.to_datetimeindex() @@ -1262,7 +1270,8 @@ def test_multiindex(): @pytest.mark.parametrize("freq", ["3663s", "33min", "2h"]) @pytest.mark.parametrize("method", ["floor", "ceil", "round"]) def test_rounding_methods_against_datetimeindex(freq, method): - expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s") + # for now unit="us" seems good enough + expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="us") expected = getattr(expected, method)(freq) result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777s") result = getattr(result, method)(freq).to_datetimeindex() diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index e05d303e17b..44c0157f1b2 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1,7 +1,7 @@ from __future__ import annotations import warnings 
-from datetime import timedelta +from datetime import datetime, timedelta from itertools import product from typing import Literal @@ -34,11 +34,11 @@ format_cftime_datetime, infer_datetime_units, infer_timedelta_units, - to_timedelta_unboxed, ) from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes +from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import is_duck_dask_array from xarray.testing import assert_equal, assert_identical from xarray.tests import ( @@ -124,23 +124,20 @@ def _all_cftime_date_types(): @pytest.mark.filterwarnings("ignore:Times can't be serialized faithfully") @pytest.mark.parametrize(["num_dates", "units", "calendar"], _CF_DATETIME_TESTS) def test_cf_datetime( - num_dates, - units, - calendar, + num_dates, units, calendar, time_unit: PDDatetimeUnitOptions ) -> None: import cftime expected = cftime.num2date( num_dates, units, calendar, only_use_cftime_datetimes=True ) - min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)].year - max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)].year - if min_y >= 1678 and max_y < 2262: - expected = cftime_to_nptime(expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_dates, units, calendar) + actual = decode_cf_datetime(num_dates, units, calendar, time_unit=time_unit) + + if actual.dtype.kind != "O": + expected = cftime_to_nptime(expected, time_unit=time_unit) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -150,6 +147,7 @@ def test_cf_datetime( # https://github.com/Unidata/netcdf4-python/issues/355 assert (abs_diff <= np.timedelta64(1, "s")).all() encoded1, _, _ = encode_cf_datetime(actual, units, calendar) + assert_duckarray_allclose(num_dates, encoded1) if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units: @@ -161,7 +159,7 @@ def test_cf_datetime( @requires_cftime -def test_decode_cf_datetime_overflow() -> None: +def test_decode_cf_datetime_overflow(time_unit: PDDatetimeUnitOptions) -> None: # checks for # https://github.com/pydata/pandas/issues/14068 # https://github.com/pydata/xarray/issues/975 @@ -173,12 +171,18 @@ def test_decode_cf_datetime_overflow() -> None: # date after 2262 and before 1678 days = (-117710, 95795) expected = (datetime(1677, 9, 20), datetime(2262, 4, 12)) - for i, day in enumerate(days): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - result = decode_cf_datetime(day, units) + result = decode_cf_datetime( + day, units, calendar="standard", time_unit=time_unit + ) assert result == expected[i] + # additional check to see if type/dtypes are correct + if time_unit == "ns": + assert isinstance(result.item(), datetime) + else: + assert result.dtype == np.dtype(f"=M8[{time_unit}]") def test_decode_cf_datetime_non_standard_units() -> None: @@ -211,17 +215,24 @@ def test_decode_cf_datetime_non_iso_strings() -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: +def test_decode_standard_calendar_inside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: import cftime units = "days since 0001-01-01" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", freq="h") + times = 
pd.date_range( + "2001-04-01-00", end="2001-04-30-23", unit=time_unit, freq="h" + ) + # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values - expected_dtype = np.dtype("M8[ns]") - - actual = decode_cf_datetime(time, units, calendar=calendar) - assert actual.dtype == expected_dtype + # for cftime we get "us" resolution + # ns resolution is handled by cftime due to the reference date + # being out of bounds, but the times themselves are + # representable with nanosecond resolution. + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: @@ -254,7 +265,9 @@ def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_dates_outside_timestamp_range(calendar) -> None: +def test_decode_dates_outside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: from datetime import datetime import cftime @@ -266,30 +279,37 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: expected = cftime.num2date( time, units, calendar=calendar, only_use_cftime_datetimes=True ) + if calendar == "proleptic_gregorian" and time_unit != "ns": + expected = cftime_to_nptime(expected, time_unit=time_unit) expected_date_type = type(expected[0]) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(time, units, calendar=calendar) + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) assert all(isinstance(value, expected_date_type) for value in actual) abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 - assert (abs_diff <= np.timedelta64(1, "s")).all() + assert (abs_diff <= np.timedelta64(1, "us")).all() @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) @pytest.mark.parametrize("num_time", [735368, [735368], [[735368]]]) def test_decode_standard_calendar_single_element_inside_timestamp_range( - calendar, num_time + calendar, + time_unit: PDDatetimeUnitOptions, + num_time, ) -> None: units = "days since 0001-01-01" with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + actual = decode_cf_datetime( + num_time, units, calendar=calendar, time_unit=time_unit + ) + + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") @requires_cftime @@ -327,6 +347,7 @@ def test_decode_single_element_outside_timestamp_range(calendar) -> None: @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) def test_decode_standard_calendar_multidim_time_inside_timestamp_range( calendar, + time_unit: PDDatetimeUnitOptions, ) -> None: import cftime @@ -342,8 +363,10 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( expected1 = times1.values expected2 = times2.values - actual = decode_cf_datetime(mdim_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + actual = decode_cf_datetime( + mdim_time, units, calendar=calendar, 
time_unit=time_unit + ) + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -397,7 +420,9 @@ def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: +def test_decode_multidim_time_outside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: from datetime import datetime import cftime @@ -414,11 +439,22 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: expected1 = cftime.num2date(time1, units, calendar, only_use_cftime_datetimes=True) expected2 = cftime.num2date(time2, units, calendar, only_use_cftime_datetimes=True) + if calendar == "proleptic_gregorian" and time_unit != "ns": + expected1 = cftime_to_nptime(expected1, time_unit=time_unit) + expected2 = cftime_to_nptime(expected2, time_unit=time_unit) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime( + mdim_time, units, calendar=calendar, time_unit=time_unit + ) - assert actual.dtype == np.dtype("O") + dtype: np.dtype + dtype = np.dtype("O") + if calendar == "proleptic_gregorian" and time_unit != "ns": + dtype = np.dtype(f"=M8[{time_unit}]") + + assert actual.dtype == dtype abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -507,13 +543,13 @@ def test_cf_datetime_nan(num_dates, units, expected_list) -> None: @requires_cftime -def test_decoded_cf_datetime_array_2d() -> None: +def test_decoded_cf_datetime_array_2d(time_unit: PDDatetimeUnitOptions) -> None: # regression test for GH1229 variable = Variable( ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) - result = CFDatetimeCoder().decode(variable) - assert result.dtype == "datetime64[ns]" + result = CFDatetimeCoder(time_unit=time_unit).decode(variable) + assert result.dtype == f"datetime64[{time_unit}]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -598,7 +634,7 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: if timedeltas == "NaT": timedeltas = np.timedelta64("NaT", "ns") else: - timedeltas = to_timedelta_unboxed(timedeltas) + timedeltas = pd.to_timedelta(timedeltas).to_numpy() numbers = np.array(numbers) expected = numbers @@ -615,13 +651,14 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: expected = np.timedelta64("NaT", "ns") actual = decode_cf_timedelta(np.array(np.nan), "days") assert_array_equal(expected, actual) + assert expected.dtype == actual.dtype def test_cf_timedelta_2d() -> None: units = "days" numbers = np.atleast_2d([1, 2, 3]) - timedeltas = np.atleast_2d(to_timedelta_unboxed(["1D", "2D", "3D"])) + timedeltas = np.atleast_2d(pd.to_timedelta(["1D", "2D", "3D"]).to_numpy()) expected = timedeltas actual = decode_cf_timedelta(numbers, units) @@ -629,6 +666,38 @@ def test_cf_timedelta_2d() -> None: assert expected.dtype == actual.dtype +@pytest.mark.parametrize("encoding_unit", FREQUENCIES_TO_ENCODING_UNITS.values()) +def test_decode_cf_timedelta_time_unit(time_unit, encoding_unit) -> None: + encoded = 1 + encoding_unit_as_numpy = _netcdf_to_numpy_timeunit(encoding_unit) + if np.timedelta64(1, time_unit) > np.timedelta64(1, encoding_unit_as_numpy): + expected = 
np.timedelta64(encoded, encoding_unit_as_numpy) + else: + expected = np.timedelta64(encoded, encoding_unit_as_numpy).astype( + f"timedelta64[{time_unit}]" + ) + result = decode_cf_timedelta(encoded, encoding_unit, time_unit) + assert result == expected + assert result.dtype == expected.dtype + + +def test_decode_cf_timedelta_time_unit_out_of_bounds(time_unit) -> None: + # Define a scale factor that will guarantee overflow with the given + # time_unit. + scale_factor = np.timedelta64(1, time_unit) // np.timedelta64(1, "ns") + encoded = scale_factor * 300 * 365 + with pytest.raises(OutOfBoundsTimedelta): + decode_cf_timedelta(encoded, "days", time_unit) + + +def test_cf_timedelta_roundtrip_large_value(time_unit) -> None: + value = np.timedelta64(np.iinfo(np.int64).max, time_unit) + encoded, units = encode_cf_timedelta(value) + decoded = decode_cf_timedelta(encoded, units, time_unit=time_unit) + assert value == decoded + assert value.dtype == decoded.dtype + + @pytest.mark.parametrize( ["deltas", "expected"], [ @@ -660,7 +729,7 @@ def test_format_cftime_datetime(date_args, expected) -> None: @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_cf(calendar) -> None: +def test_decode_cf(calendar, time_unit: PDDatetimeUnitOptions) -> None: days = [1.0, 2.0, 3.0] # TODO: GH5690 — do we want to allow this type for `coords`? da = DataArray(days, coords=[days], dims=["time"], name="test") @@ -674,15 +743,15 @@ def test_decode_cf(calendar) -> None: with pytest.raises(ValueError): ds = decode_cf(ds) else: - ds = decode_cf(ds) + ds = decode_cf(ds, decode_times=CFDatetimeCoder(time_unit=time_unit)) if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype("=M8[ns]") + assert ds.test.dtype == np.dtype(f"=M8[{time_unit}]") -def test_decode_cf_time_bounds() -> None: +def test_decode_cf_time_bounds(time_unit: PDDatetimeUnitOptions) -> None: da = DataArray( np.arange(6, dtype="int64").reshape((3, 2)), coords={"time": [1, 2, 3]}, @@ -703,8 +772,8 @@ def test_decode_cf_time_bounds() -> None: "units": "days since 2001-01", "calendar": "standard", } - dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype("=M8[ns]") + dsc = decode_cf(ds, decode_times=CFDatetimeCoder(time_unit=time_unit)) + assert dsc.time_bnds.dtype == np.dtype(f"=M8[{time_unit}]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -921,8 +990,8 @@ def test_use_cftime_default_standard_calendar_in_range(calendar) -> None: @requires_cftime -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1580]) def test_use_cftime_default_standard_calendar_out_of_range( calendar, units_year ) -> None: @@ -942,7 +1011,9 @@ def test_use_cftime_default_standard_calendar_out_of_range( @requires_cftime @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2000, 2500]) -def test_use_cftime_default_non_standard_calendar(calendar, units_year) -> None: +def test_use_cftime_default_non_standard_calendar( + calendar, units_year, time_unit +) -> None: from cftime import num2date numerical_dates = [0, 1] @@ -951,9 +1022,18 @@ def test_use_cftime_default_non_standard_calendar(calendar, units_year) -> None: numerical_dates, units, calendar, only_use_cftime_datetimes=True ) - with assert_no_warnings(): - result = 
decode_cf_datetime(numerical_dates, units, calendar) - np.testing.assert_array_equal(result, expected) + if time_unit == "ns" and units_year == 2500: + with pytest.warns(SerializationWarning, match="Unable to decode time axis"): + result = decode_cf_datetime( + numerical_dates, units, calendar, time_unit=time_unit + ) + else: + with assert_no_warnings(): + result = decode_cf_datetime( + numerical_dates, units, calendar, time_unit=time_unit + ) + + np.testing.assert_array_equal(result, expected) @requires_cftime @@ -984,8 +1064,8 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1582]) def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None: numerical_dates = [0, 1] units = f"days since {units_year}-01-01" @@ -1056,14 +1136,18 @@ def test_encode_cf_datetime_defaults_to_correct_dtype( @pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys()) -def test_encode_decode_roundtrip_datetime64(freq) -> None: +def test_encode_decode_roundtrip_datetime64( + freq, time_unit: PDDatetimeUnitOptions +) -> None: # See GH 4045. Prior to GH 4684 this test would fail for frequencies of # "s", "ms", "us", and "ns". initial_time = pd.date_range("1678-01-01", periods=1) times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded) + decoded = conventions.decode_cf_variable( + "time", encoded, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_equal(variable, decoded) @@ -1103,14 +1187,42 @@ def test__encode_datetime_with_cftime() -> None: np.testing.assert_equal(result, expected) +@requires_cftime +def test_encode_decode_cf_datetime_outofbounds_warnings( + time_unit: PDDatetimeUnitOptions, +) -> None: + import cftime + + if time_unit == "ns": + pytest.skip("does not work for out-of-bounds datetimes") + dates = np.array(["0001-01-01", "2001-01-01"], dtype=f"datetime64[{time_unit}]") + cfdates = np.array( + [ + cftime.datetime(t0.year, t0.month, t0.day, calendar="gregorian") + for t0 in dates.astype(datetime) + ] + ) + with pytest.warns( + SerializationWarning, match="Unable to encode numpy.datetime64 objects" + ): + encoded = encode_cf_datetime(dates, "seconds since 2000-01-01", "standard") + with pytest.warns(SerializationWarning, match="Unable to decode time axis"): + decoded = decode_cf_datetime(*encoded) + np.testing.assert_equal(decoded, cfdates) + + @pytest.mark.parametrize("calendar", ["gregorian", "Gregorian", "GREGORIAN"]) -def test_decode_encode_roundtrip_with_non_lowercase_letters(calendar) -> None: +def test_decode_encode_roundtrip_with_non_lowercase_letters( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: + # See GH 5093.
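# Hedged illustration (commented out; not part of this test) of the warning
# pair exercised in test_encode_decode_cf_datetime_outofbounds_warnings above,
# assuming a coarser-than-ns target resolution:
#
#   dates = np.array(["0001-01-01"], dtype="datetime64[s]")
#   encoded = encode_cf_datetime(dates, "seconds since 2000-01-01", "standard")
#       -> warns "Unable to encode numpy.datetime64 objects" (cftime fallback)
#   decode_cf_datetime(*encoded)
#       -> warns "Unable to decode time axis" and yields cftime.datetime values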
times = [0, 1] units = "days since 2000-01-01" attrs = {"calendar": calendar, "units": units} variable = Variable(["time"], times, attrs) - decoded = conventions.decode_cf_variable("time", variable) + decoded = conventions.decode_cf_variable( + "time", variable, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) encoded = conventions.encode_cf_variable(decoded) # Previously this would erroneously be an array of cftime.datetime @@ -1214,7 +1326,10 @@ def test_decode_float_datetime(): np.testing.assert_equal(actual, expected) -def test_decode_float_datetime_with_decimals() -> None: +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) +def test_decode_float_datetime_with_decimals( + time_unit: PDDatetimeUnitOptions, +) -> None: # test resolution enhancement for floats values = np.array([0, 0.125, 0.25, 0.375, 0.75, 1.0], dtype="float32") expected = np.array( @@ -1226,16 +1341,32 @@ def test_decode_float_datetime_with_decimals() -> None: "2000-01-01T00:00:00.750", "2000-01-01T00:00:01.000", ], - dtype="=M8[ns]", + dtype=f"=M8[{time_unit}]", ) units = "seconds since 2000-01-01" calendar = "standard" - actual = decode_cf_datetime(values, units, calendar) + actual = decode_cf_datetime(values, units, calendar, time_unit=time_unit) assert actual.dtype == expected.dtype np.testing.assert_equal(actual, expected) +@pytest.mark.parametrize( + "time_unit, num", [("s", 0.123), ("ms", 0.1234), ("us", 0.1234567)] +) +def test_coding_float_datetime_warning( + time_unit: PDDatetimeUnitOptions, num: float +) -> None: + units = "seconds since 2000-01-01" + calendar = "standard" + values = np.array([num], dtype="float32") + with pytest.warns( + SerializationWarning, + match=f"Can't decode floating point datetime to {time_unit!r}", + ): + decode_cf_datetime(values, units, calendar, time_unit=time_unit) + + @requires_cftime def test_scalar_unit() -> None: # test that a scalar units (often NaN when using to_netcdf) does not raise an error @@ -1259,7 +1390,7 @@ def test_contains_cftime_lazy() -> None: @pytest.mark.parametrize( - "timestr, timeunit, dtype, fill_value, use_encoding", + "timestr, format, dtype, fill_value, use_encoding", [ ("1677-09-21T00:12:43.145224193", "ns", np.int64, 20, True), ("1970-09-21T00:12:44.145224808", "ns", np.float64, 1e30, True), @@ -1278,14 +1409,15 @@ def test_contains_cftime_lazy() -> None: ) def test_roundtrip_datetime64_nanosecond_precision( timestr: str, - timeunit: Literal["ns", "us"], + format: Literal["ns", "us"], dtype: np.typing.DTypeLike, fill_value: int | float | None, use_encoding: bool, + time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7817 - time = np.datetime64(timestr, timeunit) - times = [np.datetime64("1970-01-01T00:00:00", timeunit), np.datetime64("NaT"), time] + time = np.datetime64(timestr, format) + times = [np.datetime64("1970-01-01T00:00:00", format), np.datetime64("NaT"), time] if use_encoding: encoding = dict(dtype=dtype, _FillValue=fill_value) @@ -1293,28 +1425,37 @@ def test_roundtrip_datetime64_nanosecond_precision( encoding = {} var = Variable(["time"], times, encoding=encoding) - assert var.dtype == np.dtype("=M8[ns]") + assert var.dtype == np.dtype(f"=M8[{format}]") encoded_var = conventions.encode_cf_variable(var) assert ( encoded_var.attrs["units"] - == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" + == f"{_numpy_to_netcdf_timeunit(format)} since 1970-01-01 00:00:00" ) assert encoded_var.attrs["calendar"] == "proleptic_gregorian" assert encoded_var.data.dtype == dtype + decoded_var = 
conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) - decoded_var = conventions.decode_cf_variable("foo", encoded_var) - assert decoded_var.dtype == np.dtype("=M8[ns]") + result_unit = ( + format + if np.timedelta64(1, format) <= np.timedelta64(1, time_unit) + else time_unit + ) + assert decoded_var.dtype == np.dtype(f"=M8[{result_unit}]") assert ( decoded_var.encoding["units"] - == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" + == f"{_numpy_to_netcdf_timeunit(format)} since 1970-01-01 00:00:00" ) assert decoded_var.encoding["dtype"] == dtype assert decoded_var.encoding["calendar"] == "proleptic_gregorian" assert_identical(var, decoded_var) -def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: +def test_roundtrip_datetime64_nanosecond_precision_warning( + time_unit: PDDatetimeUnitOptions, +) -> None: # test warning if times can't be serialized faithfully times = [ np.datetime64("1970-01-01T00:01:00", "ns"), @@ -1346,7 +1487,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) encoding = dict(dtype="float64", _FillValue=20, units=units) @@ -1358,7 +1501,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == units assert encoded_var.attrs["_FillValue"] == 20.0 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) encoding = dict(dtype="int64", _FillValue=20, units=new_units) @@ -1370,7 +1515,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) @@ -1379,7 +1526,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: [(np.int64, 20), (np.int64, np.iinfo(np.int64).min), (np.float64, 1e30)], ) def test_roundtrip_timedelta64_nanosecond_precision( - dtype: np.typing.DTypeLike, fill_value: int | float + dtype: np.typing.DTypeLike, + fill_value: int | float, + time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7942 one_day = np.timedelta64(1, "ns") @@ -1392,7 +1541,9 @@ def test_roundtrip_timedelta64_nanosecond_precision( var = Variable(["time"], timedelta_values, encoding=encoding) encoded_var = conventions.encode_cf_variable(var) - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) @@ -1587,6 +1738,7 @@ def test_encode_cf_datetime_casting_value_error(use_cftime, use_dask) -> None: with pytest.warns(UserWarning, match="Times can't be serialized"): encoded = conventions.encode_cf_variable(variable) assert encoded.attrs["units"] == "hours since 2000-01-01" + decoded = conventions.decode_cf_variable("name", encoded) 
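# A hedged note on the unit arithmetic behind result_unit above: decoding is
# expected to keep the finer of the on-disk unit and the requested time_unit
# so that no precision is lost, e.g. (illustration only):
#
#   np.timedelta64(1, "us") <= np.timedelta64(1, "s")   # True  -> keep "us"
#   np.timedelta64(1, "s")  <= np.timedelta64(1, "us")  # False -> use time_unit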
assert_equal(variable, decoded) else: diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index fd9f6ef41ea..1d80d874df0 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -2343,6 +2343,20 @@ def test_where_attrs() -> None: ), id="datetime", ), + pytest.param( + # Force a non-ns unit for the coordinate, make sure we convert to `ns` + # for backwards compatibility at the moment. This can be relaxed in the future. + xr.DataArray( + pd.date_range("1970-01-01", freq="s", periods=3, unit="s"), dims="x" + ), + xr.DataArray([0, 1], dims="degree", coords={"degree": [0, 1]}), + xr.DataArray( + [0, 1e9, 2e9], + dims="x", + coords={"x": pd.date_range("1970-01-01", freq="s", periods=3)}, + ), + id="datetime-non-ns", + ), pytest.param( xr.DataArray( np.array([1000, 2000, 3000], dtype="timedelta64[ns]"), dims="x" ), @@ -2457,6 +2471,14 @@ def test_polyval_degree_dim_checks() -> None: xr.DataArray(pd.date_range("1970-01-01", freq="ns", periods=3), dims="x"), id="datetime", ), + # Force a non-ns unit for the coordinate, make sure we convert to `ns` in both polyfit & polyval + # for backwards compatibility at the moment. This can be relaxed in the future. + pytest.param( + xr.DataArray( + pd.date_range("1970-01-01", freq="s", unit="s", periods=3), dims="x" + ), + id="datetime-non-ns", + ), pytest.param( xr.DataArray(np.array([0, 1, 2], dtype="timedelta64[ns]"), dims="x"), id="timedelta", diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index c3caab4e125..9e8e06fc1ee 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -317,7 +317,6 @@ def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_concat_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 7616f12957f..346ad1c908b 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -18,6 +18,7 @@ ) from xarray.backends.common import WritableCFDataStore from xarray.backends.memory import InMemoryDataStore +from xarray.coders import CFDatetimeCoder from xarray.conventions import decode_cf from xarray.testing import assert_identical from xarray.tests import ( @@ -213,7 +214,6 @@ def test_deterministic_coords_encoding(self) -> None: vars, attrs = conventions.encode_dataset_coordinates(ds) assert attrs["coordinates"] == "bar baz" - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -231,7 +231,6 @@ def test_emit_coordinates_attribute_in_attrs(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_emit_coordinates_attribute_in_encoding(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -437,7 +436,8 @@ def test_invalid_timedelta_units_do_not_decode(self, decode_times) -> None: assert_identical(expected, decode_cf(ds, decode_times=decode_times)) @requires_cftime - def test_dataset_repr_with_netcdf4_datetimes(self) -> None: + @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) + def test_dataset_repr_with_netcdf4_datetimes(self, time_unit) -> None: # regression test for #347 attrs =
{"units": "days since 0001-01-01", "calendar": "noleap"} with warnings.catch_warnings(): @@ -448,8 +448,9 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: attrs = {"units": "days since 1900-01-01"} ds = decode_cf( Dataset({"time": ("time", [0, 1], attrs)}), + decode_times=CFDatetimeCoder(time_unit=time_unit), ) - assert "(time) datetime64[ns]" in repr(ds) + assert f"(time) datetime64[{time_unit}]" in repr(ds) @requires_cftime def test_decode_cf_datetime_transition_to_invalid(self) -> None: @@ -508,14 +509,18 @@ def test_decode_dask_times(self) -> None: conventions.decode_cf(original).chunk(), ) - def test_decode_cf_time_kwargs(self) -> None: + @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) + def test_decode_cf_time_kwargs(self, time_unit) -> None: + # todo: if we set timedelta attrs "units": "days" + # this errors on the last decode_cf wrt to the lazy_elemwise_func + # trying to convert twice ds = Dataset.from_dict( { "coords": { "timedelta": { "data": np.array([1, 2, 3], dtype="int64"), "dims": "timedelta", - "attrs": {"units": "days"}, + "attrs": {"units": "seconds"}, }, "time": { "data": np.array([1, 2, 3], dtype="int64"), @@ -530,15 +535,21 @@ def test_decode_cf_time_kwargs(self) -> None: } ) - dsc = conventions.decode_cf(ds) + dsc = conventions.decode_cf( + ds, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert dsc.timedelta.dtype == np.dtype("m8[ns]") - assert dsc.time.dtype == np.dtype("M8[ns]") + assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype("int64") - dsc = conventions.decode_cf(ds, decode_times=True, decode_timedelta=False) + dsc = conventions.decode_cf( + ds, + decode_times=CFDatetimeCoder(time_unit=time_unit), + decode_timedelta=False, + ) assert dsc.timedelta.dtype == np.dtype("int64") - assert dsc.time.dtype == np.dtype("M8[ns]") + assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") dsc = conventions.decode_cf(ds, decode_times=False, decode_timedelta=True) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype("int64") diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 7f6673628aa..c94eefd74ea 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3661,7 +3661,6 @@ def test_to_and_from_dict( actual_no_data = da.to_dict(data=False, encoding=encoding) assert expected_no_data == actual_no_data - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_to_and_from_dict_with_time_dim(self) -> None: x = np.random.randn(10, 3) t = pd.date_range("20130101", periods=10) @@ -3670,7 +3669,6 @@ def test_to_and_from_dict_with_time_dim(self) -> None: roundtripped = DataArray.from_dict(da.to_dict()) assert_identical(da, roundtripped) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_to_and_from_dict_with_nan_nat(self) -> None: y = np.random.randn(10, 3) y[2] = np.nan diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d92b26fcee5..8a90a05a4e3 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -105,24 +105,24 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: lon = [0, 1, 2] nt1 = 3 nt2 = 2 - time1 = pd.date_range("2000-01-01", periods=nt1) - time2 = pd.date_range("2000-02-01", periods=nt2) + time1 = pd.date_range("2000-01-01", periods=nt1).as_unit("ns") + time2 = pd.date_range("2000-02-01", 
periods=nt2).as_unit("ns") string_var = np.array(["a", "bc", "def"], dtype=object) string_var_to_append = np.array(["asdf", "asdfg"], dtype=object) string_var_fixed_length = np.array(["aa", "bb", "cc"], dtype="|S2") string_var_fixed_length_to_append = np.array(["dd", "ee"], dtype="|S2") unicode_var = np.array(["áó", "áó", "áó"]) datetime_var = np.array( - ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]" + ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[ns]" ) datetime_var_to_append = np.array( - ["2019-01-04", "2019-01-05"], dtype="datetime64[s]" + ["2019-01-04", "2019-01-05"], dtype="datetime64[ns]" ) bool_var = np.array([True, False, True], dtype=bool) bool_var_to_append = np.array([False, True], dtype=bool) with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Converting non-nanosecond") + warnings.filterwarnings("ignore", "Converting non-default") ds = xr.Dataset( data_vars={ "da": xr.DataArray( @@ -289,7 +289,7 @@ def test_repr(self) -> None: Coordinates: * dim2 (dim2) float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' - * time (time) datetime64[ns] 160B 2000-01-01 2000-01-02 ... 2000-01-20 + * time (time) datetime64[{}] 160B 2000-01-01 2000-01-02 ... 2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 Data variables: @@ -297,7 +297,10 @@ def test_repr(self) -> None: var2 (dim1, dim2) float64 576B 0.953 1.52 1.704 ... 0.1347 -0.6423 var3 (dim3, dim1) float64 640B 0.4107 0.9941 0.1665 ... 0.716 1.555 Attributes: - foo: bar""".format(data["dim3"].dtype) + foo: bar""".format( + data["dim3"].dtype, + "ns", + ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) @@ -496,7 +499,6 @@ def test_constructor_1d(self) -> None: actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: @@ -3546,9 +3548,9 @@ def test_expand_dims_create_index_from_iterable(self): def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 - with pytest.warns(UserWarning, match="non-nanosecond precision"): - ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) - assert ds.time.dtype == np.dtype("datetime64[ns]") + # todo: test still needed? 
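# Hedged aside (illustration only, not part of this test): pandas supports
# only "s", "ms", "us" and "ns", so coarser numpy units are promoted to the
# nearest supported resolution, e.g.
#
#   np.datetime64("2018-01-01", "m")  -> datetime64[s] inside a Variable
#   np.datetime64("2018-01-01", "D")  -> datetime64[s] inside a Variable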
+        ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "m")]})
+        assert ds.time.dtype == np.dtype("datetime64[s]")

     def test_set_index(self) -> None:
         expected = create_test_multiindex()
@@ -6067,7 +6069,6 @@ def test_dataset_math_auto_align(self) -> None:
         expected = ds + other.reindex_like(ds)
         assert_identical(expected, actual)

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_dataset_math_errors(self) -> None:
         ds = self.make_example_math_dataset()
@@ -7207,7 +7208,6 @@ def test_differentiate(dask, edge_order) -> None:
         da.differentiate("x2d")

-@pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
 @pytest.mark.parametrize("dask", [True, False])
 def test_differentiate_datetime(dask) -> None:
     rs = np.random.default_rng(42)
@@ -7402,7 +7402,6 @@ def test_cumulative_integrate(dask) -> None:
         da.cumulative_integrate("x2d")

-@pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
 @pytest.mark.parametrize("dask", [True, False])
 @pytest.mark.parametrize("which_datetime", ["np", "cftime"])
 def test_trapezoid_datetime(dask, which_datetime) -> None:
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index 7dd6cdb622d..3c7f46f5a02 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -638,7 +638,6 @@ def test_groupby_repr_datetime(obj) -> None:

 @pytest.mark.filterwarnings("ignore:No index created for dimension id:UserWarning")
-@pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
 @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning")
 @pytest.mark.parametrize("shuffle", [True, False])
 @pytest.mark.parametrize(
@@ -2200,9 +2199,8 @@ def test_upsample_interpolate(self) -> None:
             assert_allclose(expected, actual, rtol=1e-16)

     @requires_scipy
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_upsample_interpolate_bug_2197(self) -> None:
-        dates = pd.date_range("2007-02-01", "2007-03-01", freq="D")
+        dates = pd.date_range("2007-02-01", "2007-03-01", freq="D", unit="s")
         da = xr.DataArray(np.arange(len(dates)), [("time", dates)])
         result = da.resample(time="ME").interpolate("linear")
         expected_times = np.array(
diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py
index da17a908eff..b2171f31c33 100644
--- a/xarray/tests/test_interp.py
+++ b/xarray/tests/test_interp.py
@@ -718,7 +718,6 @@ def test_interp_like() -> None:
         pytest.param("2000-01-01T12:00", 0.5, marks=pytest.mark.xfail),
     ],
 )
-@pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
 def test_datetime(x_new, expected) -> None:
     da = xr.DataArray(
         np.arange(24),
diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py
index 8e00b943de8..0a05451cb85 100644
--- a/xarray/tests/test_plot.py
+++ b/xarray/tests/test_plot.py
@@ -2962,7 +2962,6 @@ def test_datetime_plot1d(self) -> None:
         # mpl.dates.AutoDateLocator passes and no other subclasses:
         assert type(ax.xaxis.get_major_locator()) is mpl.dates.AutoDateLocator

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_datetime_plot2d(self) -> None:
         # Test that matplotlib-native datetime works:
         da = DataArray(
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index c3de2253186..4cf4204649d 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -37,7 +37,6 @@
     assert_identical,
     assert_no_warnings,
     has_dask_ge_2024_11_0,
-    has_pandas_3,
     raise_if_dask_computes,
     requires_bottleneck,
     requires_cupy,
@@ -201,24 +200,24 @@ def test_index_0d_string(self):
         x = self.cls(["x"], [value])
         self._assertIndexedLikeNDArray(x, value, dtype)

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_index_0d_datetime(self):
         d = datetime(2000, 1, 1)
         x = self.cls(["x"], [d])
         self._assertIndexedLikeNDArray(x, np.datetime64(d))

         x = self.cls(["x"], [np.datetime64(d)])
-        self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]")
+        self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[us]")

         x = self.cls(["x"], pd.DatetimeIndex([d]))
         self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]")

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_index_0d_timedelta64(self):
         td = timedelta(hours=1)
-
+        # todo: discussion needed
         x = self.cls(["x"], [np.timedelta64(td)])
-        self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]")
+        self._assertIndexedLikeNDArray(
+            x, np.timedelta64(td), np.dtype("timedelta64[us]")
+        )

         x = self.cls(["x"], pd.to_timedelta([td]))
         self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]")
@@ -254,7 +253,6 @@ def test_0d_object_array_with_list(self):
         assert_array_equal(x[0].data, listarray.squeeze())
         assert_array_equal(x.squeeze().data, listarray.squeeze())

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_index_and_concat_datetime(self):
         # regression test for #125
         date_range = pd.date_range("2011-09-01", periods=10)
@@ -275,53 +273,49 @@ def test_0d_time_data(self):
         expected = np.datetime64("2000-01-01", "ns")
         assert x[0].values == expected

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_datetime64_conversion(self):
-        times = pd.date_range("2000-01-01", periods=3)
-        for values in [
-            times,
-            times.values,
-            times.values.astype("datetime64[s]"),
-            times.to_pydatetime(),
-        ]:
-            v = self.cls(["t"], values)
-            assert v.dtype == np.dtype("datetime64[ns]")
-            assert_array_equal(v.values, times.values)
-            assert v.values.dtype == np.dtype("datetime64[ns]")
-
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_timedelta64_conversion(self):
-        times = pd.timedelta_range(start=0, periods=3)
-        for values in [
-            times,
-            times.values,
-            times.values.astype("timedelta64[s]"),
-            times.to_pytimedelta(),
-        ]:
-            v = self.cls(["t"], values)
-            assert v.dtype == np.dtype("timedelta64[ns]")
-            assert_array_equal(v.values, times.values)
-            assert v.values.dtype == np.dtype("timedelta64[ns]")
+    dt64_data = pd.date_range("1970-01-01", periods=3)
+
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (dt64_data, "ns"),
+            (dt64_data.values, "ns"),
+            (dt64_data.values.astype("datetime64[m]"), "s"),
+            (dt64_data.values.astype("datetime64[s]"), "s"),
+            (dt64_data.values.astype("datetime64[ps]"), "ns"),
+            (dt64_data.to_pydatetime(), "ns"),
+        ],
+    )
+    def test_datetime64_conversion(self, values, unit):
+        v = self.cls(["t"], values)
+        assert v.dtype == np.dtype(f"datetime64[{unit}]")
+        assert_array_equal(v.values, self.dt64_data.values)
+        assert v.values.dtype == np.dtype(f"datetime64[{unit}]")
+
+    td64_data = pd.timedelta_range(start=0, periods=3)
+
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (td64_data, "ns"),
+            (td64_data.values, "ns"),
+            (td64_data.values.astype("timedelta64[m]"), "s"),
+            (td64_data.values.astype("timedelta64[s]"), "s"),
+            (td64_data.values.astype("timedelta64[ps]"), "ns"),
+            (td64_data.to_pytimedelta(), "ns"),
+        ],
+    )
+    def test_timedelta64_conversion(self, values, unit):
+        v = self.cls(["t"], values)
+        assert v.dtype == np.dtype(f"timedelta64[{unit}]")
+        assert_array_equal(v.values, self.td64_data.values)
+        assert v.values.dtype == np.dtype(f"timedelta64[{unit}]")

     def test_object_conversion(self):
         data = np.arange(5).astype(str).astype(object)
         actual = self.cls("x", data)
         assert actual.dtype == data.dtype

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_datetime64_valid_range(self):
-        data = np.datetime64("1250-01-01", "us")
-        pderror = pd.errors.OutOfBoundsDatetime
-        with pytest.raises(pderror, match=r"Out of bounds nanosecond"):
-            self.cls(["t"], [data])
-
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_timedelta64_valid_range(self):
-        data = np.timedelta64("200000", "D")
-        pderror = pd.errors.OutOfBoundsTimedelta
-        with pytest.raises(pderror, match=r"Cannot convert"):
-            self.cls(["t"], [data])
-
     def test_pandas_data(self):
         v = self.cls(["x"], pd.Series([0, 1, 2], index=[3, 2, 1]))
         assert_identical(v, v[[0, 1, 2]])
@@ -1073,31 +1067,36 @@ def test_numpy_same_methods(self):
         v = IndexVariable("x", np.arange(5))
         assert 2 == v.searchsorted(2)

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_datetime64_conversion_scalar(self):
-        expected = np.datetime64("2000-01-01", "ns")
-        for values in [
-            np.datetime64("2000-01-01"),
-            pd.Timestamp("2000-01-01T00"),
-            datetime(2000, 1, 1),
-        ]:
-            v = Variable([], values)
-            assert v.dtype == np.dtype("datetime64[ns]")
-            assert v.values == expected
-            assert v.values.dtype == np.dtype("datetime64[ns]")
-
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_timedelta64_conversion_scalar(self):
-        expected = np.timedelta64(24 * 60 * 60 * 10**9, "ns")
-        for values in [
-            np.timedelta64(1, "D"),
-            pd.Timedelta("1 day"),
-            timedelta(days=1),
-        ]:
-            v = Variable([], values)
-            assert v.dtype == np.dtype("timedelta64[ns]")
-            assert v.values == expected
-            assert v.values.dtype == np.dtype("timedelta64[ns]")
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (np.datetime64("2000-01-01"), "s"),
+            (pd.Timestamp("2000-01-01T00"), "ns"),
+            (datetime(2000, 1, 1), "ns"),
+            (np.datetime64("2000-01-01T00:00:00.1234567891"), "ns"),
+        ],
+    )
+    def test_datetime64_conversion_scalar(self, values, unit):
+        v = Variable([], values)
+        assert v.dtype == np.dtype(f"datetime64[{unit}]")
+        assert np.issubdtype(v.values, "datetime64")
+        assert v.values.dtype == np.dtype(f"datetime64[{unit}]")
+
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (np.timedelta64(1, "m"), "s"),
+            (np.timedelta64(1, "D"), "s"),
+            (np.timedelta64(1001, "ps"), "ns"),
+            (pd.Timedelta("1 day"), "ns"),
+            (timedelta(days=1), "ns"),
+        ],
+    )
+    def test_timedelta64_conversion_scalar(self, values, unit):
+        v = Variable([], values)
+        assert v.dtype == np.dtype(f"timedelta64[{unit}]")
+        assert np.issubdtype(v.values, "timedelta64")
+        assert v.values.dtype == np.dtype(f"timedelta64[{unit}]")

     def test_0d_str(self):
         v = Variable([], "foo")
@@ -1108,18 +1107,19 @@ def test_0d_str(self):
         assert v.dtype == np.dtype("S3")
         assert v.values == "foo".encode("ascii")

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_0d_datetime(self):
         v = Variable([], pd.Timestamp("2000-01-01"))
         assert v.dtype == np.dtype("datetime64[ns]")
         assert v.values == np.datetime64("2000-01-01", "ns")

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_0d_timedelta(self):
-        for td in [pd.to_timedelta("1s"), np.timedelta64(1, "s")]:
-            v = Variable([], td)
-            assert v.dtype == np.dtype("timedelta64[ns]")
-            assert v.values == np.timedelta64(10**9, "ns")
+    @pytest.mark.parametrize(
+        "values, unit", [(pd.to_timedelta("1s"), "ns"), (np.timedelta64(1, "s"), "s")]
+    )
+    def test_0d_timedelta(self, values, unit):
+        # todo: check if this test is OK
+        v = Variable([], values)
+        assert v.dtype == np.dtype(f"timedelta64[{unit}]")
+        assert v.values == np.timedelta64(10**9, "ns")

     def test_equals_and_identical(self):
         d = np.random.rand(10, 3)
@@ -1559,7 +1559,6 @@ def test_transpose(self):
             v.transpose(..., "not_a_dim", missing_dims="warn")
         assert_identical(expected_ell, actual)

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_transpose_0d(self):
         for value in [
             3.5,
@@ -1955,7 +1954,6 @@ def test_big_endian_reduce(self):
         expected = Variable([], 5)
         assert_identical(expected, v.sum())

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_reduce_funcs(self):
         v = Variable("x", np.array([1, np.nan, 2, 3]))
         assert_identical(v.mean(), Variable([], 2))
@@ -2636,19 +2634,18 @@ def test_masked_array(self):
         assert_array_equal(expected, actual)
         assert actual.dtype == expected.dtype

-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
     def test_datetime(self):
         expected = np.datetime64("2000-01-01")
         actual = as_compatible_data(expected)
         assert expected == actual
         assert np.ndarray is type(actual)
-        assert np.dtype("datetime64[ns]") == actual.dtype
+        assert np.dtype("datetime64[s]") == actual.dtype

         expected = np.array([np.datetime64("2000-01-01")])
         actual = as_compatible_data(expected)
         assert np.asarray(expected) == actual
         assert np.ndarray is type(actual)
-        assert np.dtype("datetime64[ns]") == actual.dtype
+        assert np.dtype("datetime64[s]") == actual.dtype

         expected = np.array([np.datetime64("2000-01-01", "ns")])
         actual = as_compatible_data(expected)
@@ -2672,7 +2669,7 @@ def test_tz_datetime(self) -> None:
             warnings.simplefilter("ignore")
             actual: T_DuckArray = as_compatible_data(times_s)
         assert actual.array == times_s
-        assert actual.array.dtype == pd.DatetimeTZDtype("ns", tz)
+        assert actual.array.dtype == pd.DatetimeTZDtype("s", tz)  # type: ignore[arg-type]

         series = pd.Series(times_s)
         with warnings.catch_warnings():
@@ -2680,7 +2677,7 @@ def test_tz_datetime(self) -> None:
             actual2: T_DuckArray = as_compatible_data(series)

         np.testing.assert_array_equal(actual2, np.asarray(series.values))
-        assert actual2.dtype == np.dtype("datetime64[ns]")
+        assert actual2.dtype == np.dtype("datetime64[s]")

     def test_full_like(self) -> None:
         # For more thorough tests, see test_variable.py
@@ -2976,37 +2973,31 @@ def test_from_pint_wrapping_dask(self, Var):


 @pytest.mark.parametrize(
-    ("values", "warns"),
+    ("values", "unit"),
     [
-        (np.datetime64("2000-01-01", "ns"), False),
-        (np.datetime64("2000-01-01", "s"), True),
-        (np.array([np.datetime64("2000-01-01", "ns")]), False),
-        (np.array([np.datetime64("2000-01-01", "s")]), True),
-        (pd.date_range("2000", periods=1), False),
-        (datetime(2000, 1, 1), has_pandas_3),
-        (np.array([datetime(2000, 1, 1)]), has_pandas_3),
-        (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), False),
+        (np.datetime64("2000-01-01", "ns"), "ns"),
+        (np.datetime64("2000-01-01", "s"), "s"),
+        (np.array([np.datetime64("2000-01-01", "ns")]), "ns"),
+        (np.array([np.datetime64("2000-01-01", "s")]), "s"),
+        (pd.date_range("2000", periods=1), "ns"),
+        (datetime(2000, 1, 1), "ns"),
+        (np.array([datetime(2000, 1, 1)]), "ns"),
+        (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), "ns"),
         (
             pd.Series(
                 pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York"))
             ),
-            False,
+            "ns",
         ),
     ],
     ids=lambda x: f"{x}",
 )
-def test_datetime_conversion_warning(values, warns) -> None:
+def test_datetime_conversion(values, unit) -> None:
+    # todo: check for redundancy (suggested per review)
     dims = ["time"] if isinstance(values, np.ndarray | pd.Index | pd.Series) else []
-    if warns:
-        with pytest.warns(UserWarning, match="non-nanosecond precision datetime"):
-            var = Variable(dims, values)
-    else:
-        with warnings.catch_warnings():
-            warnings.simplefilter("error")
-            var = Variable(dims, values)
-
+    var = Variable(dims, values)
     if var.dtype.kind == "M":
-        assert var.dtype == np.dtype("datetime64[ns]")
+        assert var.dtype == np.dtype(f"datetime64[{unit}]")
     else:
         # The only case where a non-datetime64 dtype can occur currently is in
         # the case that the variable is backed by a timezone-aware
@@ -3044,65 +3035,35 @@ def test_pandas_two_only_datetime_conversion_warnings(
     data: pd.DatetimeIndex | pd.Series, dtype: str | pd.DatetimeTZDtype
 ) -> None:
-    with pytest.warns(UserWarning, match="non-nanosecond precision datetime"):
-        var = Variable(["time"], data.astype(dtype))  # type: ignore[arg-type]
+    # todo: check for redundancy (suggested per review)
+    var = Variable(["time"], data.astype(dtype))  # type: ignore[arg-type]

     if var.dtype.kind == "M":
-        assert var.dtype == np.dtype("datetime64[ns]")
+        assert var.dtype == np.dtype("datetime64[s]")
     else:
         # The only case where a non-datetime64 dtype can occur currently is in
         # the case that the variable is backed by a timezone-aware
         # DatetimeIndex, and thus is hidden within the PandasIndexingAdapter class.
         assert isinstance(var._data, PandasIndexingAdapter)
-        assert var._data.array.dtype == pd.DatetimeTZDtype("ns", tz_ny)
+        assert var._data.array.dtype == pd.DatetimeTZDtype("s", tz_ny)


 @pytest.mark.parametrize(
-    ("values", "warns"),
+    ("values", "unit"),
     [
-        (np.timedelta64(10, "ns"), False),
-        (np.timedelta64(10, "s"), True),
-        (np.array([np.timedelta64(10, "ns")]), False),
-        (np.array([np.timedelta64(10, "s")]), True),
-        (pd.timedelta_range("1", periods=1), False),
-        (timedelta(days=1), False),
-        (np.array([timedelta(days=1)]), False),
+        (np.timedelta64(10, "ns"), "ns"),
+        (np.timedelta64(10, "s"), "s"),
+        (np.array([np.timedelta64(10, "ns")]), "ns"),
+        (np.array([np.timedelta64(10, "s")]), "s"),
+        (pd.timedelta_range("1", periods=1), "ns"),
+        (timedelta(days=1), "ns"),
+        (np.array([timedelta(days=1)]), "ns"),
+        (pd.timedelta_range("1", periods=1).astype("timedelta64[s]"), "s"),
     ],
     ids=lambda x: f"{x}",
 )
-def test_timedelta_conversion_warning(values, warns) -> None:
+def test_timedelta_conversion(values, unit) -> None:
+    # todo: check for redundancy
     dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else []
-    if warns:
-        with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"):
-            var = Variable(dims, values)
-    else:
-        with warnings.catch_warnings():
-            warnings.simplefilter("error")
-            var = Variable(dims, values)
-
-    assert var.dtype == np.dtype("timedelta64[ns]")
-
-
-def test_pandas_two_only_timedelta_conversion_warning() -> None:
-    # Note this test relies on a pandas feature that is only present in pandas
-    # 2.0.0 and above, and so for now cannot be parametrized.
-    data = pd.timedelta_range("1", periods=1).astype("timedelta64[s]")
-    with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"):
-        var = Variable(["time"], data)
-
-    assert var.dtype == np.dtype("timedelta64[ns]")
-
-
-@pytest.mark.parametrize(
-    ("index", "dtype"),
-    [
-        (pd.date_range("2000", periods=1), "datetime64"),
-        (pd.timedelta_range("1", periods=1), "timedelta64"),
-    ],
-    ids=lambda x: f"{x}",
-)
-def test_pandas_indexing_adapter_non_nanosecond_conversion(index, dtype) -> None:
-    data = PandasIndexingAdapter(index.astype(f"{dtype}[s]"))
-    with pytest.warns(UserWarning, match="non-nanosecond precision"):
-        var = Variable(["time"], data)
-    assert var.dtype == np.dtype(f"{dtype}[ns]")
+    var = Variable(dims, values)
+    assert var.dtype == np.dtype(f"timedelta64[{unit}]")
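
Taken together, the test changes above pin down the new resolution-preserving behavior: constructing a ``Variable`` from non-nanosecond ``datetime64`` or ``timedelta64`` input no longer warns and casts to ``'ns'``; the input unit is kept, with coarser-than-second units promoted to ``'s'``, while pandas-backed input stays at pandas' default ``'ns'``. The following is a minimal sketch of the behavior the updated assertions encode (not part of the patch, and assuming a build of xarray that includes these changes):

.. code-block:: python

    from datetime import timedelta

    import numpy as np
    import pandas as pd
    import xarray as xr

    # Non-nanosecond numpy input keeps its resolution instead of being cast
    # to 'ns' with a warning; coarser-than-second units are promoted to 's'.
    var = xr.Variable(["t"], np.array(["2000-01-01"], dtype="datetime64[s]"))
    assert var.dtype == np.dtype("datetime64[s]")
    var = xr.Variable(["t"], np.array(["2000-01-01"], dtype="datetime64[m]"))
    assert var.dtype == np.dtype("datetime64[s]")

    # pandas-backed input keeps pandas' default nanosecond resolution.
    var = xr.Variable(["t"], pd.date_range("2000", periods=3))
    assert var.dtype == np.dtype("datetime64[ns]")

    # Timedeltas behave analogously: numpy units are preserved (or promoted
    # to 's'), while stdlib timedeltas go through pandas and land at 'ns'.
    assert xr.Variable([], np.timedelta64(1, "D")).dtype == np.dtype("timedelta64[s]")
    assert xr.Variable([], timedelta(days=1)).dtype == np.dtype("timedelta64[ns]")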