Merge branch 'main' into add-dt-features
aulemahal authored Sep 9, 2024
2 parents 31adaca + 0af1979 commit 02b06d5
Showing 46 changed files with 790 additions and 397 deletions.
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/datatree.py
@@ -0,0 +1,11 @@
+import xarray as xr
+from xarray.core.datatree import DataTree
+
+
+class Datatree:
+    def setup(self):
+        run1 = DataTree.from_dict({"run1": xr.Dataset({"a": 1})})
+        self.d = {"run1": run1}
+
+    def time_from_dict(self):
+        DataTree.from_dict(self.d)
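
For context on the ASV conventions assumed here: ``setup()`` runs before timing, and the body of each ``time_*`` method is what gets timed. A rough plain-Python stand-in for what this benchmark measures (illustrative only, not how ASV actually invokes it):

import timeit

import xarray as xr
from xarray.core.datatree import DataTree

# Equivalent of setup(): build the input mapping once, outside the timed code.
d = {"run1": DataTree.from_dict({"run1": xr.Dataset({"a": 1})})}

# Equivalent of time_from_dict(): the statement ASV times repeatedly.
print(timeit.timeit(lambda: DataTree.from_dict(d), number=1000))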
19 changes: 17 additions & 2 deletions asv_bench/benchmarks/groupby.py
@@ -1,4 +1,5 @@
# import flox to avoid the cost of first import
+import cftime
import flox.xarray # noqa
import numpy as np
import pandas as pd
@@ -96,7 +97,7 @@ def setup(self, *args, **kwargs):

        requires_dask()
        super().setup(**kwargs)
-        self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
+        self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dask_dataframe()
        self.ds1d_mean = self.ds1d.groupby("b").mean().compute()

    def time_binary_op_2d(self):
@@ -169,7 +170,21 @@ class GroupByLongTime:
    def setup(self, use_cftime, use_flox):
        arr = np.random.randn(10, 10, 365 * 30)
        time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime)
-        self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time})
+
+        # GH9426 - deep-copying CFTime object arrays is weirdly slow
+        asda = xr.DataArray(time)
+        labeled_time = []
+        for year, month in zip(asda.dt.year, asda.dt.month):
+            labeled_time.append(cftime.datetime(year, month, 1))
+
+        self.da = xr.DataArray(
+            arr,
+            dims=("y", "x", "time"),
+            coords={"time": time, "time2": ("time", labeled_time)},
+        )
+
+    def time_setup(self, use_cftime, use_flox):
+        self.da.groupby("time.month")

    def time_mean(self, use_cftime, use_flox):
        with xr.set_options(use_flox=use_flox):
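
The GH9426 comment in the new setup code refers to deep copies of object arrays of cftime datetimes being slow. A minimal sketch of that slow path (hedged: assumes cftime is installed, and timings vary by environment):

import copy

import numpy as np
import xarray as xr

# Object arrays of cftime datetimes are copied element by element, unlike
# datetime64 arrays, which copy as a single memory block.
time = xr.date_range("2000", periods=365 * 30, use_cftime=True)
obj_arr = np.array(time, dtype=object)
copied = copy.deepcopy(obj_arr)  # the per-element copy GH9426 is about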
4 changes: 2 additions & 2 deletions ci/requirements/bare-minimum.yml
@@ -11,6 +11,6 @@ dependencies:
- pytest-env
- pytest-xdist
- pytest-timeout
-  - numpy=1.23
+  - numpy=1.24
- packaging=23.1
-  - pandas=2.0
+  - pandas=2.1
24 changes: 12 additions & 12 deletions ci/requirements/min-all-deps.yml
@@ -9,37 +9,37 @@ dependencies:
# doc/user-guide/installing.rst, doc/user-guide/plotting.rst and setup.py.
- python=3.10
- array-api-strict=1.0 # dependency for testing the array api compat
-  - boto3=1.26
+  - boto3=1.28
- bottleneck=1.3
-  - cartopy=0.21
+  - cartopy=0.22
- cftime=1.6
- coveralls
-  - dask-core=2023.4
-  - distributed=2023.4
+  - dask-core=2023.9
+  - distributed=2023.9
# Flox > 0.8 has a bug with numbagg versions
# It will require numbagg > 0.6
# so we should just skip that series eventually
# or keep flox pinned for longer than necessary
- flox=0.7
-  - h5netcdf=1.1
+  - h5netcdf=1.2
# h5py and hdf5 tend to cause conflicts
# for e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
- h5py=3.8
- hdf5=1.12
- hypothesis
-  - iris=3.4
+  - iris=3.7
- lxml=4.9 # Optional dep of pydap
- matplotlib-base=3.7
- nc-time-axis=1.4
# netcdf follows a 1.major.minor[.patch] convention
# (see https://github.com/Unidata/netcdf4-python/issues/1090)
- netcdf4=1.6.0
-  - numba=0.56
+  - numba=0.57
- numbagg=0.2.1
-  - numpy=1.23
+  - numpy=1.24
- packaging=23.1
-  - pandas=2.0
+  - pandas=2.1
- pint=0.22
- pip
- pydap=3.4
@@ -49,9 +49,9 @@ dependencies:
- pytest-xdist
- pytest-timeout
- rasterio=1.3
-  - scipy=1.10
+  - scipy=1.11
- seaborn=0.12
- sparse=0.14
- toolz=0.12
-  - typing_extensions=4.5
-  - zarr=2.14
+  - typing_extensions=4.7
+  - zarr=2.16
2 changes: 1 addition & 1 deletion design_notes/flexible_indexes_notes.md
@@ -71,7 +71,7 @@ An `XarrayIndex` subclass must/should/may implement the following properties/methods
- a `data` property to access index's data and map it to coordinate data (see [Section 4](#4-indexvariable))
- a `__getitem__()` implementation to propagate the index through DataArray/Dataset indexing operations
- `equals()`, `union()` and `intersection()` methods for data alignment (see [Section 2.6](#26-using-indexes-for-data-alignment))
-- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coodinates))
+- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coordinates))
- a method that may return a new index and that will be called when one of the corresponding coordinates is dropped from the Dataset/DataArray (multi-coordinate indexes)
- `encode()`/`decode()` methods that would allow storage-agnostic serialization and fast-path reconstruction of the underlying index object(s) (see [Section 2.8](#28-index-encoding))
- one or more "non-standard" methods or properties that could be leveraged in Xarray 3rd-party extensions like Dataset/DataArray accessors (see [Section 2.7](#27-using-indexes-for-other-purposes))
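
As a rough illustration of the interface listed above, a hypothetical skeleton (names and signatures are assumptions for illustration, not the final Xarray API):

from typing import Any


class CustomIndex:
    """Illustrative stub of the XarrayIndex interface sketched above."""

    @property
    def data(self) -> Any:
        """Map the index's data back to coordinate data."""
        ...

    def __getitem__(self, indexer: Any) -> "CustomIndex":
        """Propagate the index through indexing operations."""
        ...

    def equals(self, other: "CustomIndex") -> bool: ...

    def union(self, other: "CustomIndex") -> "CustomIndex": ...

    def intersection(self, other: "CustomIndex") -> "CustomIndex": ...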
2 changes: 1 addition & 1 deletion design_notes/grouper_objects.md
@@ -166,7 +166,7 @@ where `|` represents chunk boundaries. A simple rechunking to
```
000|111122|3333
```
-would make this resampling reduction an embarassingly parallel blockwise problem.
+would make this resampling reduction an embarrassingly parallel blockwise problem.

Similarly consider monthly-mean climatologies for which the month numbers might be
```
2 changes: 1 addition & 1 deletion design_notes/named_array_design_doc.md
@@ -258,7 +258,7 @@ Questions:
Variable.coarsen_reshape
Variable.rolling_window

-Variable.set_dims # split this into broadcas_to and expand_dims
+Variable.set_dims # split this into broadcast_to and expand_dims


# Reordering/Reshaping
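
The ``set_dims`` note above proposes splitting one method into two primitives; numpy analogues of that decomposition (a sketch of the design intent, not the ``Variable`` API):

import numpy as np

x = np.array([1, 2, 3])
expanded = np.expand_dims(x, axis=0)           # add a size-1 dimension -> (1, 3)
broadcast = np.broadcast_to(expanded, (4, 3))  # broadcast to a target shape -> (4, 3)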
4 changes: 2 additions & 2 deletions doc/user-guide/dask.rst
@@ -298,7 +298,7 @@ Automatic parallelization with ``apply_ufunc`` and ``map_blocks``

.. tip::

-Some problems can become embarassingly parallel and thus easy to parallelize
+Some problems can become embarrassingly parallel and thus easy to parallelize
automatically by rechunking to a frequency, e.g. ``ds.chunk(time=TimeResampler("YE"))``.
See :py:meth:`Dataset.chunk` for more.

@@ -559,7 +559,7 @@ larger chunksizes.

.. tip::

-Many time domain problems become amenable to an embarassingly parallel or blockwise solution
+Many time domain problems become amenable to an embarrassingly parallel or blockwise solution
(e.g. using :py:func:`xarray.map_blocks`, :py:func:`dask.array.map_blocks`, or
:py:func:`dask.array.blockwise`) by rechunking to a frequency along the time dimension.
Provide :py:class:`xarray.groupers.TimeResampler` objects to :py:meth:`Dataset.chunk` to do so.
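
A minimal sketch of the rechunk-to-frequency tip above (synthetic data; assumes dask is installed):

import numpy as np
import pandas as pd
import xarray as xr
from xarray.groupers import TimeResampler

time = pd.date_range("2000-01-01", periods=4 * 365, freq="D")
ds = xr.Dataset({"a": ("time", np.random.randn(time.size))}, coords={"time": time})

# Align dask chunks with year boundaries so the resample below is blockwise.
chunked = ds.chunk(time=TimeResampler("YE"))
result = chunked.resample(time="YE").mean().compute()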
2 changes: 1 addition & 1 deletion doc/user-guide/data-structures.rst
@@ -289,7 +289,7 @@ pressure that were made under various conditions:
* the measurements were made on four different days;
* they were made at two separate locations, which we will represent using
their latitude and longitude; and
-* they were made using instruments by three different manufacutrers, which we
+* they were made using instruments by three different manufacturers, which we
will refer to as `'manufac1'`, `'manufac2'`, and `'manufac3'`.

.. ipython:: python
6 changes: 6 additions & 0 deletions doc/user-guide/groupby.rst
@@ -305,6 +305,12 @@ Use grouper objects to group by multiple dimensions:
from xarray.groupers import UniqueGrouper
da.groupby(["lat", "lon"]).sum()
+The above is sugar for using ``UniqueGrouper`` objects directly:
+
+.. ipython:: python
+
+    da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
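
A self-contained sketch of the equivalence the new docs describe (the ``da`` below is made up for illustration; it is not the one from the docs page):

import numpy as np
import xarray as xr
from xarray.groupers import UniqueGrouper

da = xr.DataArray(
    np.arange(12).reshape(3, 4),
    dims=("lat", "lon"),
    coords={"lat": [10.0, 20.0, 10.0], "lon": [0, 1, 0, 1]},
)

# Both forms group over the unique (lat, lon) values.
a = da.groupby(["lat", "lon"]).sum()
b = da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
print(a.equals(b))  # True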
2 changes: 1 addition & 1 deletion doc/user-guide/pandas.rst
@@ -120,7 +120,7 @@ Particularly after a roundtrip, the following deviations are noted:

- a non-dimension Dataset ``coordinate`` is converted into ``variable``
- a non-dimension DataArray ``coordinate`` is not converted
-- ``dtype`` is not allways the same (e.g. "str" is converted to "object")
+- ``dtype`` is not always the same (e.g. "str" is converted to "object")
- ``attrs`` metadata is not conserved

To avoid these problems, the third-party `ntv-pandas <https://github.com/loco-philippe/ntv-pandas>`__ library offers lossless and reversible conversions between
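
A quick illustration of the ``dtype`` deviation noted above (a sketch; exact dtypes can vary with numpy and pandas versions):

import xarray as xr

da = xr.DataArray(["a", "b"], dims="x", name="s")
back = da.to_dataframe()["s"].to_xarray()

print(da.dtype)    # <U1, a fixed-width numpy string dtype
print(back.dtype)  # object, after the pandas roundtrip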
34 changes: 31 additions & 3 deletions doc/whats-new.rst
@@ -42,6 +42,24 @@ Performance
Breaking changes
~~~~~~~~~~~~~~~~
- Support for ``python 3.9`` has been dropped (:pull:`8937`)
+- The minimum versions of some dependencies were changed
+
+  ===================== ========= =======
+  Package               Old       New
+  ===================== ========= =======
+  boto3                 1.26      1.28
+  cartopy               0.21      0.22
+  dask-core             2023.4    2023.9
+  distributed           2023.4    2023.9
+  h5netcdf              1.1       1.2
+  iris                  3.4       3.7
+  numba                 0.56      0.57
+  numpy                 1.23      1.24
+  pandas                2.0       2.1
+  scipy                 1.10      1.11
+  typing_extensions     4.5       4.7
+  zarr                  2.14      2.16
+  ===================== ========= =======


Deprecations
@@ -67,6 +85,16 @@ Bug fixes
- Fix deprecation warning that was raised when calling ``np.array`` on an ``xr.DataArray``
in NumPy 2.0 (:issue:`9312`, :pull:`9393`)
By `Andrew Scherer <https://github.com/andrew-s28>`_.
+- Fix support for using ``pandas.DateOffset``, ``pandas.Timedelta``, and
+  ``datetime.timedelta`` objects as ``resample`` frequencies
+  (:issue:`9408`, :pull:`9413`).
+  By `Oliver Higgs <https://github.com/oliverhiggs>`_.
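
A brief sketch of what this fix enables (synthetic data; usage per the entry above):

import datetime

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2000-01-01", periods=48, freq="h")
ds = xr.Dataset({"a": ("time", np.arange(48))}, coords={"time": time})

# These offset objects are now accepted as resample frequencies,
# each equivalent to the string "1D":
ds.resample(time=datetime.timedelta(days=1)).mean()
ds.resample(time=pd.Timedelta(days=1)).mean()
ds.resample(time=pd.DateOffset(days=1)).mean()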

+Performance
+~~~~~~~~~~~
+
+- Speed up grouping by avoiding deep-copy of non-dimension coordinates (:issue:`9426`, :pull:`9393`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.

Documentation
~~~~~~~~~~~~~
@@ -101,7 +129,7 @@ New Features
(:issue:`6610`, :pull:`8840`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- Allow rechunking to a frequency using ``Dataset.chunk(time=TimeResampler("YE"))`` syntax. (:issue:`7559`, :pull:`9109`)
-Such rechunking allows many time domain analyses to be executed in an embarassingly parallel fashion.
+Such rechunking allows many time domain analyses to be executed in an embarrassingly parallel fashion.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Allow per-variable specification of ``mask_and_scale``, ``decode_times``, ``decode_timedelta``,
``use_cftime`` and ``concat_characters`` params in :py:func:`~xarray.open_dataset` (:pull:`9218`).
@@ -134,7 +162,7 @@ Breaking changes

Bug fixes
~~~~~~~~~
-- Fix scatter plot broadcasting unneccesarily. (:issue:`9129`, :pull:`9206`)
+- Fix scatter plot broadcasting unnecessarily. (:issue:`9129`, :pull:`9206`)
By `Jimmy Westling <https://github.com/illviljan>`_.
- Don't convert custom indexes to ``pandas`` indexes when computing a diff (:pull:`9157`)
By `Justus Magin <https://github.com/keewis>`_.
@@ -597,7 +625,7 @@ Internal Changes
~~~~~~~~~~~~~~~~

- The implementation of :py:func:`map_blocks` has changed to minimize graph size and duplication of data.
-This should be a strict improvement even though the graphs are not always embarassingly parallel any more.
+This should be a strict improvement even though the graphs are not always embarrassingly parallel any more.
Please open an issue if you spot a regression. (:pull:`8412`, :issue:`8409`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- Remove null values before plotting. (:pull:`8535`).
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -22,9 +22,9 @@ readme = "README.md"
requires-python = ">=3.10"

dependencies = [
-    "numpy>=1.23",
+    "numpy>=1.24",
    "packaging>=23.1",
-    "pandas>=2.0",
+    "pandas>=2.1",
]

[project.optional-dependencies]
48 changes: 42 additions & 6 deletions xarray/coding/cftime_offsets.py
@@ -47,7 +47,7 @@
from collections.abc import Mapping
from datetime import datetime, timedelta
from functools import partial
-from typing import TYPE_CHECKING, ClassVar, Literal
+from typing import TYPE_CHECKING, ClassVar, Literal, TypeVar

import numpy as np
import pandas as pd
@@ -80,6 +80,7 @@


DayOption: TypeAlias = Literal["start", "end"]
+T_FreqStr = TypeVar("T_FreqStr", str, None)


def _nanosecond_precision_timestamp(*args, **kwargs):
@@ -739,7 +740,7 @@ def _generate_anchored_deprecated_frequencies(
    return pairs


-_DEPRECATED_FREQUENICES: dict[str, str] = {
+_DEPRECATED_FREQUENCIES: dict[str, str] = {
    "A": "YE",
    "Y": "YE",
    "AS": "YS",
@@ -765,32 +766,67 @@


def _emit_freq_deprecation_warning(deprecated_freq):
-    recommended_freq = _DEPRECATED_FREQUENICES[deprecated_freq]
+    recommended_freq = _DEPRECATED_FREQUENCIES[deprecated_freq]
    message = _DEPRECATION_MESSAGE.format(
        deprecated_freq=deprecated_freq, recommended_freq=recommended_freq
    )
    emit_user_level_warning(message, FutureWarning)


-def to_offset(freq: BaseCFTimeOffset | str, warn: bool = True) -> BaseCFTimeOffset:
+def to_offset(
+    freq: BaseCFTimeOffset | str | timedelta | pd.Timedelta | pd.DateOffset,
+    warn: bool = True,
+) -> BaseCFTimeOffset:
    """Convert a frequency string to the appropriate subclass of
    BaseCFTimeOffset."""
    if isinstance(freq, BaseCFTimeOffset):
        return freq
+    if isinstance(freq, timedelta | pd.Timedelta):
+        return delta_to_tick(freq)
+    if isinstance(freq, pd.DateOffset):
+        freq = _legacy_to_new_freq(freq.freqstr)

    match = re.match(_PATTERN, freq)
    if match is None:
        raise ValueError("Invalid frequency string provided")
    freq_data = match.groupdict()

    freq = freq_data["freq"]
-    if warn and freq in _DEPRECATED_FREQUENICES:
+    if warn and freq in _DEPRECATED_FREQUENCIES:
        _emit_freq_deprecation_warning(freq)
    multiples = freq_data["multiple"]
    multiples = 1 if multiples is None else int(multiples)
    return _FREQUENCIES[freq](n=multiples)


+def delta_to_tick(delta: timedelta | pd.Timedelta) -> Tick:
+    """Adapted from pandas.tslib.delta_to_tick"""
+    if isinstance(delta, pd.Timedelta) and delta.nanoseconds != 0:
+        # pandas.Timedelta has nanoseconds, but these are not supported
+        raise ValueError(
+            "Unable to convert 'pandas.Timedelta' object with non-zero "
+            "nanoseconds to 'CFTimeOffset' object"
+        )
+    if delta.microseconds == 0:
+        if delta.seconds == 0:
+            return Day(n=delta.days)
+        else:
+            seconds = delta.days * 86400 + delta.seconds
+            if seconds % 3600 == 0:
+                return Hour(n=seconds // 3600)
+            elif seconds % 60 == 0:
+                return Minute(n=seconds // 60)
+            else:
+                return Second(n=seconds)
+    else:
+        # Regardless of the days and seconds this will always be a Millisecond
+        # or Microsecond object
+        if delta.microseconds % 1_000 == 0:
+            return Millisecond(n=delta.microseconds // 1_000)
+        else:
+            return Microsecond(n=delta.microseconds)


def to_cftime_datetime(date_str_or_date, calendar=None):
    if cftime is None:
        raise ModuleNotFoundError("No module named 'cftime'")
@@ -1332,7 +1368,7 @@ def _new_to_legacy_freq(freq):
    return freq


-def _legacy_to_new_freq(freq):
+def _legacy_to_new_freq(freq: T_FreqStr) -> T_FreqStr:
    # to avoid internal deprecation warnings when freq is determined using pandas < 2.2

    # TODO: remove once requiring pandas >= 2.2
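
To make the new ``to_offset`` inputs concrete, a hedged usage sketch (expected results shown as comments; assumes pandas >= 2.2 frequency naming):

import datetime

import pandas as pd
from xarray.coding.cftime_offsets import to_offset

to_offset(datetime.timedelta(hours=2))           # Hour(n=2)
to_offset(pd.Timedelta("90s"))                   # Second(n=90)
to_offset(datetime.timedelta(milliseconds=500))  # Millisecond(n=500)
to_offset(pd.offsets.MonthEnd(3))                # MonthEnd(n=3), via freqstr "3ME"
# Non-zero nanoseconds cannot be represented and raise ValueError:
# to_offset(pd.Timedelta(1, unit="ns"))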