Merge branch 'main' into add-dt-features
aulemahal authored Sep 9, 2024
2 parents 31adaca + 0af1979 commit 02b06d5
Showing 46 changed files with 790 additions and 397 deletions.
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/datatree.py
@@ -0,0 +1,11 @@
+import xarray as xr
+from xarray.core.datatree import DataTree
+
+
+class Datatree:
+    def setup(self):
+        run1 = DataTree.from_dict({"run1": xr.Dataset({"a": 1})})
+        self.d = {"run1": run1}
+
+    def time_from_dict(self):
+        DataTree.from_dict(self.d)
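
For context on the ASV conventions assumed here: ``setup()`` runs before timing, and the body of each ``time_*`` method is what gets timed. A rough plain-Python stand-in for what this benchmark measures (illustrative only, not how ASV actually invokes it):

import timeit

import xarray as xr
from xarray.core.datatree import DataTree

# Equivalent of setup(): build the input mapping once, outside the timed code.
d = {"run1": DataTree.from_dict({"run1": xr.Dataset({"a": 1})})}

# Equivalent of time_from_dict(): the statement ASV times repeatedly.
print(timeit.timeit(lambda: DataTree.from_dict(d), number=1000))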
19 changes: 17 additions & 2 deletions asv_bench/benchmarks/groupby.py
@@ -1,4 +1,5 @@
# import flox to avoid the cost of first import
+import cftime
import flox.xarray # noqa
import numpy as np
import pandas as pd
@@ -96,7 +97,7 @@ def setup(self, *args, **kwargs):

        requires_dask()
        super().setup(**kwargs)
-        self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
+        self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dask_dataframe()
        self.ds1d_mean = self.ds1d.groupby("b").mean().compute()

    def time_binary_op_2d(self):
@@ -169,7 +170,21 @@ class GroupByLongTime:
    def setup(self, use_cftime, use_flox):
        arr = np.random.randn(10, 10, 365 * 30)
        time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime)
-        self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time})
+
+        # GH9426 - deep-copying CFTime object arrays is weirdly slow
+        asda = xr.DataArray(time)
+        labeled_time = []
+        for year, month in zip(asda.dt.year, asda.dt.month):
+            labeled_time.append(cftime.datetime(year, month, 1))
+
+        self.da = xr.DataArray(
+            arr,
+            dims=("y", "x", "time"),
+            coords={"time": time, "time2": ("time", labeled_time)},
+        )
+
+    def time_setup(self, use_cftime, use_flox):
+        self.da.groupby("time.month")

    def time_mean(self, use_cftime, use_flox):
        with xr.set_options(use_flox=use_flox):
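
The GH9426 comment in the new setup code refers to deep copies of object arrays of cftime datetimes being slow. A minimal sketch of that slow path (hedged: assumes cftime is installed, and timings vary by environment):

import copy

import numpy as np
import xarray as xr

# Object arrays of cftime datetimes are copied element by element, unlike
# datetime64 arrays, which copy as a single memory block.
time = xr.date_range("2000", periods=365 * 30, use_cftime=True)
obj_arr = np.array(time, dtype=object)
copied = copy.deepcopy(obj_arr)  # the per-element copy GH9426 is about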
4 changes: 2 additions & 2 deletions ci/requirements/bare-minimum.yml
@@ -11,6 +11,6 @@ dependencies:
- pytest-env
- pytest-xdist
- pytest-timeout
-  - numpy=1.23
+  - numpy=1.24
- packaging=23.1
-  - pandas=2.0
+  - pandas=2.1
24 changes: 12 additions & 12 deletions ci/requirements/min-all-deps.yml
@@ -9,37 +9,37 @@ dependencies:
# doc/user-guide/installing.rst, doc/user-guide/plotting.rst and setup.py.
- python=3.10
- array-api-strict=1.0 # dependency for testing the array api compat
-  - boto3=1.26
+  - boto3=1.28
- bottleneck=1.3
-  - cartopy=0.21
+  - cartopy=0.22
- cftime=1.6
- coveralls
-  - dask-core=2023.4
-  - distributed=2023.4
+  - dask-core=2023.9
+  - distributed=2023.9
# Flox > 0.8 has a bug with numbagg versions
# It will require numbagg > 0.6
# so we should just skip that series eventually
# or keep flox pinned for longer than necessary
- flox=0.7
-  - h5netcdf=1.1
+  - h5netcdf=1.2
# h5py and hdf5 tend to cause conflicts
# for e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
- h5py=3.8
- hdf5=1.12
- hypothesis
-  - iris=3.4
+  - iris=3.7
- lxml=4.9 # Optional dep of pydap
- matplotlib-base=3.7
- nc-time-axis=1.4
# netcdf follows a 1.major.minor[.patch] convention
# (see https://github.com/Unidata/netcdf4-python/issues/1090)
- netcdf4=1.6.0
-  - numba=0.56
+  - numba=0.57
- numbagg=0.2.1
-  - numpy=1.23
+  - numpy=1.24
- packaging=23.1
-  - pandas=2.0
+  - pandas=2.1
- pint=0.22
- pip
- pydap=3.4
@@ -49,9 +49,9 @@ dependencies:
- pytest-xdist
- pytest-timeout
- rasterio=1.3
-  - scipy=1.10
+  - scipy=1.11
- seaborn=0.12
- sparse=0.14
- toolz=0.12
-  - typing_extensions=4.5
-  - zarr=2.14
+  - typing_extensions=4.7
+  - zarr=2.16
2 changes: 1 addition & 1 deletion design_notes/flexible_indexes_notes.md
@@ -71,7 +71,7 @@ An `XarrayIndex` subclass must/should/may implement the following properties/methods
- a `data` property to access index's data and map it to coordinate data (see [Section 4](#4-indexvariable))
- a `__getitem__()` implementation to propagate the index through DataArray/Dataset indexing operations
- `equals()`, `union()` and `intersection()` methods for data alignment (see [Section 2.6](#26-using-indexes-for-data-alignment))
-- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coodinates))
+- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coordinates))
- a method that may return a new index and that will be called when one of the corresponding coordinates is dropped from the Dataset/DataArray (multi-coordinate indexes)
- `encode()`/`decode()` methods that would allow storage-agnostic serialization and fast-path reconstruction of the underlying index object(s) (see [Section 2.8](#28-index-encoding))
- one or more "non-standard" methods or properties that could be leveraged in Xarray 3rd-party extensions like Dataset/DataArray accessors (see [Section 2.7](#27-using-indexes-for-other-purposes))
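
As a rough illustration of the interface listed above, a hypothetical skeleton (names and signatures are assumptions for illustration, not the final Xarray API):

from typing import Any


class CustomIndex:
    """Illustrative stub of the XarrayIndex interface sketched above."""

    @property
    def data(self) -> Any:
        """Map the index's data back to coordinate data."""
        ...

    def __getitem__(self, indexer: Any) -> "CustomIndex":
        """Propagate the index through indexing operations."""
        ...

    def equals(self, other: "CustomIndex") -> bool: ...

    def union(self, other: "CustomIndex") -> "CustomIndex": ...

    def intersection(self, other: "CustomIndex") -> "CustomIndex": ...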
2 changes: 1 addition & 1 deletion design_notes/grouper_objects.md
@@ -166,7 +166,7 @@ where `|` represents chunk boundaries. A simple rechunking to
```
000|111122|3333
```
-would make this resampling reduction an embarassingly parallel blockwise problem.
+would make this resampling reduction an embarrassingly parallel blockwise problem.

Similarly consider monthly-mean climatologies for which the month numbers might be
```
2 changes: 1 addition & 1 deletion design_notes/named_array_design_doc.md
@@ -258,7 +258,7 @@ Questions:
Variable.coarsen_reshape
Variable.rolling_window

-Variable.set_dims # split this into broadcas_to and expand_dims
+Variable.set_dims # split this into broadcast_to and expand_dims


# Reordering/Reshaping
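
The ``set_dims`` note above proposes splitting one method into two primitives; numpy analogues of that decomposition (a sketch of the design intent, not the ``Variable`` API):

import numpy as np

x = np.array([1, 2, 3])
expanded = np.expand_dims(x, axis=0)           # add a size-1 dimension -> (1, 3)
broadcast = np.broadcast_to(expanded, (4, 3))  # broadcast to a target shape -> (4, 3)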
4 changes: 2 additions & 2 deletions doc/user-guide/dask.rst
@@ -298,7 +298,7 @@ Automatic parallelization with ``apply_ufunc`` and ``map_blocks``

.. tip::

-Some problems can become embarassingly parallel and thus easy to parallelize
+Some problems can become embarrassingly parallel and thus easy to parallelize
automatically by rechunking to a frequency, e.g. ``ds.chunk(time=TimeResampler("YE"))``.
See :py:meth:`Dataset.chunk` for more.

@@ -559,7 +559,7 @@ larger chunksizes.

.. tip::

-Many time domain problems become amenable to an embarassingly parallel or blockwise solution
+Many time domain problems become amenable to an embarrassingly parallel or blockwise solution
(e.g. using :py:func:`xarray.map_blocks`, :py:func:`dask.array.map_blocks`, or
:py:func:`dask.array.blockwise`) by rechunking to a frequency along the time dimension.
Provide :py:class:`xarray.groupers.TimeResampler` objects to :py:meth:`Dataset.chunk` to do so.
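
A minimal sketch of the rechunk-to-frequency tip above (synthetic data; assumes dask is installed):

import numpy as np
import pandas as pd
import xarray as xr
from xarray.groupers import TimeResampler

time = pd.date_range("2000-01-01", periods=4 * 365, freq="D")
ds = xr.Dataset({"a": ("time", np.random.randn(time.size))}, coords={"time": time})

# Align dask chunks with year boundaries so the resample below is blockwise.
chunked = ds.chunk(time=TimeResampler("YE"))
result = chunked.resample(time="YE").mean().compute()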
2 changes: 1 addition & 1 deletion doc/user-guide/data-structures.rst
@@ -289,7 +289,7 @@ pressure that were made under various conditions:
* the measurements were made on four different days;
* they were made at two separate locations, which we will represent using
their latitude and longitude; and
-* they were made using instruments by three different manufacutrers, which we
+* they were made using instruments by three different manufacturers, which we
will refer to as `'manufac1'`, `'manufac2'`, and `'manufac3'`.

.. ipython:: python
6 changes: 6 additions & 0 deletions doc/user-guide/groupby.rst
@@ -305,6 +305,12 @@ Use grouper objects to group by multiple dimensions:
from xarray.groupers import UniqueGrouper
da.groupby(["lat", "lon"]).sum()
+The above is sugar for using ``UniqueGrouper`` objects directly:
+
+.. ipython:: python
+
+    da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
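
A self-contained sketch of the equivalence the new docs describe (the ``da`` below is made up for illustration; it is not the one from the docs page):

import numpy as np
import xarray as xr
from xarray.groupers import UniqueGrouper

da = xr.DataArray(
    np.arange(12).reshape(3, 4),
    dims=("lat", "lon"),
    coords={"lat": [10.0, 20.0, 10.0], "lon": [0, 1, 0, 1]},
)

# Both forms group over the unique (lat, lon) values.
a = da.groupby(["lat", "lon"]).sum()
b = da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
print(a.equals(b))  # True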
2 changes: 1 addition & 1 deletion doc/user-guide/pandas.rst
@@ -120,7 +120,7 @@ Particularly after a roundtrip, the following deviations are noted:

- a non-dimension Dataset ``coordinate`` is converted into ``variable``
- a non-dimension DataArray ``coordinate`` is not converted
-- ``dtype`` is not allways the same (e.g. "str" is converted to "object")
+- ``dtype`` is not always the same (e.g. "str" is converted to "object")
- ``attrs`` metadata is not conserved

To avoid these problems, the third-party `ntv-pandas <https://github.com/loco-philippe/ntv-pandas>`__ library offers lossless and reversible conversions between
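
A quick illustration of the ``dtype`` deviation noted above (a sketch; exact dtypes can vary with numpy and pandas versions):

import xarray as xr

da = xr.DataArray(["a", "b"], dims="x", name="s")
back = da.to_dataframe()["s"].to_xarray()

print(da.dtype)    # <U1, a fixed-width numpy string dtype
print(back.dtype)  # object, after the pandas roundtrip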
34 changes: 31 additions & 3 deletions doc/whats-new.rst
@@ -42,6 +42,24 @@ Performance
Breaking changes
~~~~~~~~~~~~~~~~
- Support for ``python 3.9`` has been dropped (:pull:`8937`)
+- The minimum versions of some dependencies were changed
+
+  ===================== ========= =======
+  Package               Old       New
+  ===================== ========= =======
+  boto3                 1.26      1.28
+  cartopy               0.21      0.22
+  dask-core             2023.4    2023.9
+  distributed           2023.4    2023.9
+  h5netcdf              1.1       1.2
+  iris                  3.4       3.7
+  numba                 0.56      0.57
+  numpy                 1.23      1.24
+  pandas                2.0       2.1
+  scipy                 1.10      1.11
+  typing_extensions     4.5       4.7
+  zarr                  2.14      2.16
+  ===================== ========= =======


Deprecations
@@ -67,6 +85,16 @@ Bug fixes
- Fix deprecation warning that was raised when calling ``np.array`` on an ``xr.DataArray``
in NumPy 2.0 (:issue:`9312`, :pull:`9393`)
By `Andrew Scherer <https://github.com/andrew-s28>`_.
+- Fix support for using ``pandas.DateOffset``, ``pandas.Timedelta``, and
+  ``datetime.timedelta`` objects as ``resample`` frequencies
+  (:issue:`9408`, :pull:`9413`).
+  By `Oliver Higgs <https://github.com/oliverhiggs>`_.
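
A brief sketch of what this fix enables (synthetic data; usage per the entry above):

import datetime

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2000-01-01", periods=48, freq="h")
ds = xr.Dataset({"a": ("time", np.arange(48))}, coords={"time": time})

# These offset objects are now accepted as resample frequencies,
# each equivalent to the string "1D":
ds.resample(time=datetime.timedelta(days=1)).mean()
ds.resample(time=pd.Timedelta(days=1)).mean()
ds.resample(time=pd.DateOffset(days=1)).mean()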

+Performance
+~~~~~~~~~~~
+
+- Speed up grouping by avoiding deep-copy of non-dimension coordinates (:issue:`9426`, :pull:`9393`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.

Documentation
~~~~~~~~~~~~~
@@ -101,7 +129,7 @@ New Features
(:issue:`6610`, :pull:`8840`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- Allow rechunking to a frequency using ``Dataset.chunk(time=TimeResampler("YE"))`` syntax. (:issue:`7559`, :pull:`9109`)
-Such rechunking allows many time domain analyses to be executed in an embarassingly parallel fashion.
+Such rechunking allows many time domain analyses to be executed in an embarrassingly parallel fashion.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Allow per-variable specification of ``mask_and_scale``, ``decode_times``, ``decode_timedelta``,
``use_cftime`` and ``concat_characters`` params in :py:func:`~xarray.open_dataset` (:pull:`9218`).
@@ -134,7 +162,7 @@ Breaking changes

Bug fixes
~~~~~~~~~
-- Fix scatter plot broadcasting unneccesarily. (:issue:`9129`, :pull:`9206`)
+- Fix scatter plot broadcasting unnecessarily. (:issue:`9129`, :pull:`9206`)
By `Jimmy Westling <https://github.com/illviljan>`_.
- Don't convert custom indexes to ``pandas`` indexes when computing a diff (:pull:`9157`)
By `Justus Magin <https://github.com/keewis>`_.
@@ -597,7 +625,7 @@ Internal Changes
~~~~~~~~~~~~~~~~

- The implementation of :py:func:`map_blocks` has changed to minimize graph size and duplication of data.
-This should be a strict improvement even though the graphs are not always embarassingly parallel any more.
+This should be a strict improvement even though the graphs are not always embarrassingly parallel any more.
Please open an issue if you spot a regression. (:pull:`8412`, :issue:`8409`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- Remove null values before plotting. (:pull:`8535`).
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -22,9 +22,9 @@ readme = "README.md"
requires-python = ">=3.10"

dependencies = [
-    "numpy>=1.23",
+    "numpy>=1.24",
    "packaging>=23.1",
-    "pandas>=2.0",
+    "pandas>=2.1",
]

[project.optional-dependencies]
48 changes: 42 additions & 6 deletions xarray/coding/cftime_offsets.py
@@ -47,7 +47,7 @@
from collections.abc import Mapping
from datetime import datetime, timedelta
from functools import partial
-from typing import TYPE_CHECKING, ClassVar, Literal
+from typing import TYPE_CHECKING, ClassVar, Literal, TypeVar

import numpy as np
import pandas as pd
@@ -80,6 +80,7 @@


DayOption: TypeAlias = Literal["start", "end"]
+T_FreqStr = TypeVar("T_FreqStr", str, None)


def _nanosecond_precision_timestamp(*args, **kwargs):
@@ -739,7 +740,7 @@ def _generate_anchored_deprecated_frequencies(
    return pairs


-_DEPRECATED_FREQUENICES: dict[str, str] = {
+_DEPRECATED_FREQUENCIES: dict[str, str] = {
    "A": "YE",
    "Y": "YE",
    "AS": "YS",
@@ -765,32 +766,67 @@


def _emit_freq_deprecation_warning(deprecated_freq):
-    recommended_freq = _DEPRECATED_FREQUENICES[deprecated_freq]
+    recommended_freq = _DEPRECATED_FREQUENCIES[deprecated_freq]
    message = _DEPRECATION_MESSAGE.format(
        deprecated_freq=deprecated_freq, recommended_freq=recommended_freq
    )
    emit_user_level_warning(message, FutureWarning)


-def to_offset(freq: BaseCFTimeOffset | str, warn: bool = True) -> BaseCFTimeOffset:
+def to_offset(
+    freq: BaseCFTimeOffset | str | timedelta | pd.Timedelta | pd.DateOffset,
+    warn: bool = True,
+) -> BaseCFTimeOffset:
    """Convert a frequency string to the appropriate subclass of
    BaseCFTimeOffset."""
    if isinstance(freq, BaseCFTimeOffset):
        return freq
+    if isinstance(freq, timedelta | pd.Timedelta):
+        return delta_to_tick(freq)
+    if isinstance(freq, pd.DateOffset):
+        freq = _legacy_to_new_freq(freq.freqstr)

    match = re.match(_PATTERN, freq)
    if match is None:
        raise ValueError("Invalid frequency string provided")
    freq_data = match.groupdict()

    freq = freq_data["freq"]
-    if warn and freq in _DEPRECATED_FREQUENICES:
+    if warn and freq in _DEPRECATED_FREQUENCIES:
        _emit_freq_deprecation_warning(freq)
    multiples = freq_data["multiple"]
    multiples = 1 if multiples is None else int(multiples)
    return _FREQUENCIES[freq](n=multiples)


+def delta_to_tick(delta: timedelta | pd.Timedelta) -> Tick:
+    """Adapted from pandas.tslib.delta_to_tick"""
+    if isinstance(delta, pd.Timedelta) and delta.nanoseconds != 0:
+        # pandas.Timedelta has nanoseconds, but these are not supported
+        raise ValueError(
+            "Unable to convert 'pandas.Timedelta' object with non-zero "
+            "nanoseconds to 'CFTimeOffset' object"
+        )
+    if delta.microseconds == 0:
+        if delta.seconds == 0:
+            return Day(n=delta.days)
+        else:
+            seconds = delta.days * 86400 + delta.seconds
+            if seconds % 3600 == 0:
+                return Hour(n=seconds // 3600)
+            elif seconds % 60 == 0:
+                return Minute(n=seconds // 60)
+            else:
+                return Second(n=seconds)
+    else:
+        # Regardless of the days and seconds this will always be a Millisecond
+        # or Microsecond object
+        if delta.microseconds % 1_000 == 0:
+            return Millisecond(n=delta.microseconds // 1_000)
+        else:
+            return Microsecond(n=delta.microseconds)


def to_cftime_datetime(date_str_or_date, calendar=None):
    if cftime is None:
        raise ModuleNotFoundError("No module named 'cftime'")
@@ -1332,7 +1368,7 @@ def _new_to_legacy_freq(freq):
    return freq


-def _legacy_to_new_freq(freq):
+def _legacy_to_new_freq(freq: T_FreqStr) -> T_FreqStr:
    # to avoid internal deprecation warnings when freq is determined using pandas < 2.2

    # TODO: remove once requiring pandas >= 2.2
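
To make the new ``to_offset`` inputs concrete, a hedged usage sketch (expected results shown as comments; assumes pandas >= 2.2 frequency naming):

import datetime

import pandas as pd
from xarray.coding.cftime_offsets import to_offset

to_offset(datetime.timedelta(hours=2))           # Hour(n=2)
to_offset(pd.Timedelta("90s"))                   # Second(n=90)
to_offset(datetime.timedelta(milliseconds=500))  # Millisecond(n=500)
to_offset(pd.offsets.MonthEnd(3))                # MonthEnd(n=3), via freqstr "3ME"
# Non-zero nanoseconds cannot be represented and raise ValueError:
# to_offset(pd.Timedelta(1, unit="ns"))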