diff --git a/.github/config.yml b/.github/config.yml new file mode 100644 index 00000000000..c64c2e28e59 --- /dev/null +++ b/.github/config.yml @@ -0,0 +1,23 @@ +# Comment to be posted to on first time issues +newIssueWelcomeComment: > + Thanks for opening your first issue here at xarray! Be sure to follow the issue template! + + If you have an idea for a solution, we would really welcome a Pull Request with proposed changes. + + See the [Contributing Guide](https://docs.xarray.dev/en/latest/contributing.html) for more. + + It may take us a while to respond here, but we really value your contribution. Contributors like you help make xarray better. + + Thank you! + +# Comment to be posted to on PRs from first time contributors in your repository +newPRWelcomeComment: > + Thank you for opening this pull request! It may take us a few days to respond here, so thank you for being patient. + + If you have questions, some answers may be found in our [contributing guidelines](http://docs.xarray.dev/en/stable/contributing.html). + +# Comment to be posted to on pull requests merged by a first time user +firstPRMergeComment: > + Congratulations on completing your first pull request! Welcome to Xarray! + We are proud of you, and hope to see you again! + ![celebration gif](https://media.giphy.com/media/umYMU8G2ixG5mJBDo5/giphy.gif) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 6f069af5da6..f98bf6f9f95 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -119,14 +119,14 @@ jobs: python xarray/util/print_versions.py - name: Install mypy run: | - python -m pip install 'mypy<0.990' + python -m pip install mypy --force-reinstall - name: Run mypy run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v3.1.1 + uses: codecov/codecov-action@v3.1.2 with: file: mypy_report/cobertura.xml flags: mypy @@ -173,14 +173,14 @@ jobs: python xarray/util/print_versions.py - name: Install mypy run: | - python -m pip install 'mypy<0.990' + python -m pip install mypy --force-reinstall - name: Run mypy run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v3.1.1 + uses: codecov/codecov-action@v3.1.2 with: file: mypy_report/cobertura.xml flags: mypy39 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index acace7aab95..06d7c4a83c7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -139,7 +139,7 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.1 + uses: codecov/codecov-action@v3.1.2 with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index a1e38644045..41957a941e2 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -72,7 +72,7 @@ jobs: - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.8.3 + uses: pypa/gh-action-pypi-publish@v1.8.5 with: user: __token__ password: ${{ secrets.TESTPYPI_TOKEN }} @@ -90,7 +90,7 @@ jobs: name: releases path: dist - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.8.3 + uses: pypa/gh-action-pypi-publish@v1.8.5 with: user: __token__ password: ${{ secrets.PYPI_TOKEN }} diff --git 
a/.github/workflows/testpypi-release.yaml b/.github/workflows/testpypi-release.yaml index b892e97268f..ddc6a2bddf3 100644 --- a/.github/workflows/testpypi-release.yaml +++ b/.github/workflows/testpypi-release.yaml @@ -78,7 +78,7 @@ jobs: - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.8.3 + uses: pypa/gh-action-pypi-publish@v1.8.5 with: user: __token__ password: ${{ secrets.TESTPYPI_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 45c15da8236..a69ca2f080f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,25 +16,24 @@ repos: files: ^xarray/ - repo: https://github.com/charliermarsh/ruff-pre-commit # Ruff version. - rev: 'v0.0.259' + rev: 'v0.0.261' hooks: - id: ruff args: ["--fix"] # https://github.com/python/black#version-control-integration - repo: https://github.com/psf/black - rev: 23.1.0 + rev: 23.3.0 hooks: - - id: black - id: black-jupyter - repo: https://github.com/keewis/blackdoc rev: v0.3.8 hooks: - id: blackdoc exclude: "generate_aggregations.py" - additional_dependencies: ["black==23.1.0"] + additional_dependencies: ["black==23.3.0"] - id: blackdoc-autoupdate-black - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.1.1 + rev: v1.2.0 hooks: - id: mypy # Copied from setup.cfg diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 6f8a306fc43..f8387aca856 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -68,7 +68,8 @@ "distributed": [""], "flox": [""], "numpy_groupies": [""], - "sparse": [""] + "sparse": [""], + "cftime": [""] }, diff --git a/asv_bench/benchmarks/alignment.py b/asv_bench/benchmarks/alignment.py new file mode 100644 index 00000000000..5a6ee3fa0a6 --- /dev/null +++ b/asv_bench/benchmarks/alignment.py @@ -0,0 +1,54 @@ +import numpy as np + +import xarray as xr + +from . 
import parameterized, requires_dask + +ntime = 365 * 30 +nx = 50 +ny = 50 + +rng = np.random.default_rng(0) + + +class Align: + def setup(self, *args, **kwargs): + data = rng.standard_normal((ntime, nx, ny)) + self.ds = xr.Dataset( + {"temperature": (("time", "x", "y"), data)}, + coords={ + "time": xr.date_range("2000", periods=ntime), + "x": np.arange(nx), + "y": np.arange(ny), + }, + ) + self.year = self.ds.time.dt.year + self.idx = np.unique(rng.integers(low=0, high=ntime, size=ntime // 2)) + self.year_subset = self.year.isel(time=self.idx) + + @parameterized(["join"], [("outer", "inner", "left", "right", "exact", "override")]) + def time_already_aligned(self, join): + xr.align(self.ds, self.year, join=join) + + @parameterized(["join"], [("outer", "inner", "left", "right")]) + def time_not_aligned(self, join): + xr.align(self.ds, self.year[-100:], join=join) + + @parameterized(["join"], [("outer", "inner", "left", "right")]) + def time_not_aligned_random_integers(self, join): + xr.align(self.ds, self.year_subset, join=join) + + +class AlignCFTime(Align): + def setup(self, *args, **kwargs): + super().setup() + self.ds["time"] = xr.date_range("2000", periods=ntime, calendar="noleap") + self.year = self.ds.time.dt.year + self.year_subset = self.year.isel(time=self.idx) + + +class AlignDask(Align): + def setup(self, *args, **kwargs): + requires_dask() + super().setup() + self.ds = self.ds.chunk({"time": 100}) diff --git a/ci/requirements/all-but-dask.yml b/ci/requirements/all-but-dask.yml index 1387466b702..0c5c2fcdc1a 100644 --- a/ci/requirements/all-but-dask.yml +++ b/ci/requirements/all-but-dask.yml @@ -25,7 +25,7 @@ dependencies: - numbagg - numpy<1.24 - packaging - - pandas<2 + - pandas - pint - pip - pseudonetcdf diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index 2d35ab8724b..115d7dfa533 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -19,7 +19,7 @@ dependencies: - numba - numpy>=1.21,<1.24 - packaging>=21.3 - - pandas>=1.4,<2 + - pandas>=1.4 - pooch - pip - pre-commit diff --git a/ci/requirements/environment-py311.yml b/ci/requirements/environment-py311.yml index cd9edbb5052..97934e47b3b 100644 --- a/ci/requirements/environment-py311.yml +++ b/ci/requirements/environment-py311.yml @@ -27,7 +27,7 @@ dependencies: - numexpr - numpy - packaging - - pandas<2 + - pandas - pint - pip - pooch diff --git a/ci/requirements/environment-windows-py311.yml b/ci/requirements/environment-windows-py311.yml index effef0d7961..97cfb89ff6f 100644 --- a/ci/requirements/environment-windows-py311.yml +++ b/ci/requirements/environment-windows-py311.yml @@ -24,7 +24,7 @@ dependencies: # - numbagg - numpy - packaging - - pandas<2 + - pandas - pint - pip - pre-commit diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index c02907b24ac..87cce0a06a7 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -24,7 +24,7 @@ dependencies: - numbagg - numpy<1.24 - packaging - - pandas<2 + - pandas - pint - pip - pre-commit diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 9abe1b295a2..626a5372801 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -27,7 +27,7 @@ dependencies: - numexpr - numpy<1.24 - packaging - - pandas<2 + - pandas - pint - pip - pooch diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 04013d545c3..5d825be2e08 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -234,6 +234,7 @@ 
Variable.dims Variable.dtype Variable.encoding + Variable.reset_encoding Variable.imag Variable.nbytes Variable.ndim @@ -374,10 +375,8 @@ CFTimeIndex.is_floating CFTimeIndex.is_integer CFTimeIndex.is_interval - CFTimeIndex.is_mixed CFTimeIndex.is_numeric CFTimeIndex.is_object - CFTimeIndex.is_type_compatible CFTimeIndex.isin CFTimeIndex.isna CFTimeIndex.isnull @@ -398,7 +397,6 @@ CFTimeIndex.round CFTimeIndex.searchsorted CFTimeIndex.set_names - CFTimeIndex.set_value CFTimeIndex.shift CFTimeIndex.slice_indexer CFTimeIndex.slice_locs @@ -412,7 +410,6 @@ CFTimeIndex.to_flat_index CFTimeIndex.to_frame CFTimeIndex.to_list - CFTimeIndex.to_native_types CFTimeIndex.to_numpy CFTimeIndex.to_series CFTimeIndex.tolist @@ -437,8 +434,6 @@ CFTimeIndex.hasnans CFTimeIndex.hour CFTimeIndex.inferred_type - CFTimeIndex.is_all_dates - CFTimeIndex.is_monotonic CFTimeIndex.is_monotonic_increasing CFTimeIndex.is_monotonic_decreasing CFTimeIndex.is_unique @@ -483,7 +478,6 @@ backends.NetCDF4DataStore.is_remote backends.NetCDF4DataStore.lock - backends.NetCDF4BackendEntrypoint.available backends.NetCDF4BackendEntrypoint.description backends.NetCDF4BackendEntrypoint.url backends.NetCDF4BackendEntrypoint.guess_can_open @@ -516,7 +510,6 @@ backends.H5NetCDFStore.sync backends.H5NetCDFStore.ds - backends.H5netcdfBackendEntrypoint.available backends.H5netcdfBackendEntrypoint.description backends.H5netcdfBackendEntrypoint.url backends.H5netcdfBackendEntrypoint.guess_can_open @@ -531,7 +524,6 @@ backends.PseudoNetCDFDataStore.open_store_variable backends.PseudoNetCDFDataStore.ds - backends.PseudoNetCDFBackendEntrypoint.available backends.PseudoNetCDFBackendEntrypoint.description backends.PseudoNetCDFBackendEntrypoint.url backends.PseudoNetCDFBackendEntrypoint.guess_can_open @@ -546,7 +538,6 @@ backends.PydapDataStore.open backends.PydapDataStore.open_store_variable - backends.PydapBackendEntrypoint.available backends.PydapBackendEntrypoint.description backends.PydapBackendEntrypoint.url backends.PydapBackendEntrypoint.guess_can_open @@ -574,7 +565,6 @@ backends.ScipyDataStore.sync backends.ScipyDataStore.ds - backends.ScipyBackendEntrypoint.available backends.ScipyBackendEntrypoint.description backends.ScipyBackendEntrypoint.url backends.ScipyBackendEntrypoint.guess_can_open @@ -595,13 +585,11 @@ backends.ZarrStore.sync backends.ZarrStore.ds - backends.ZarrBackendEntrypoint.available backends.ZarrBackendEntrypoint.description backends.ZarrBackendEntrypoint.url backends.ZarrBackendEntrypoint.guess_can_open backends.ZarrBackendEntrypoint.open_dataset - backends.StoreBackendEntrypoint.available backends.StoreBackendEntrypoint.description backends.StoreBackendEntrypoint.url backends.StoreBackendEntrypoint.guess_can_open diff --git a/doc/api.rst b/doc/api.rst index 34d867cde65..d180aa66d25 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -112,6 +112,7 @@ Dataset contents Dataset.drop_dims Dataset.set_coords Dataset.reset_coords + Dataset.reset_encoding Dataset.convert_calendar Dataset.interp_calendar Dataset.get_index @@ -303,6 +304,7 @@ DataArray contents DataArray.drop_indexes DataArray.drop_duplicates DataArray.reset_coords + DataArray.reset_encoding DataArray.copy DataArray.convert_calendar DataArray.interp_calendar @@ -640,6 +642,7 @@ DataArray methods DataArray.to_numpy DataArray.to_pandas DataArray.to_series + DataArray.to_zarr DataArray.chunk DataArray.close DataArray.compute @@ -1094,6 +1097,7 @@ Advanced API backends.BackendArray backends.BackendEntrypoint backends.list_engines + backends.refresh_engines Default, 
pandas-backed indexes built-in Xarray: diff --git a/doc/contributing.rst b/doc/contributing.rst index 07938f23c9f..cdccd41d25c 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -4,29 +4,46 @@ Contributing to xarray ********************** - .. note:: Large parts of this document came from the `Pandas Contributing Guide `_. +Overview +======== + +We welcome your skills and enthusiasm at the xarray project! There are numerous opportunities to +contribute beyond just writing code. +All contributions, including bug reports, bug fixes, documentation improvements, enhancement suggestions, +and other ideas are welcome. + +If you have any questions on the process or how to fix something, feel free to ask us! +The recommended place to ask a question is on `GitHub Discussions `_ +, but we also have a `Discord `_ and a +`mailing list `_. There is also a +`"python-xarray" tag on Stack Overflow `_ which we monitor for questions. + +We also have a biweekly community call, details of which are announced on the +`Developers meeting `_. +You are very welcome to join! Though we would love to hear from you, there is no expectation to +contribute during the meeting either - you are always welcome to just sit in and listen. + +This project is a community effort, and everyone is welcome to contribute. Everyone within the community +is expected to abide by our `code of conduct `_. + Where to start? =============== -All contributions, bug reports, bug fixes, documentation improvements, -enhancements, and ideas are welcome. - If you are brand new to *xarray* or open-source development, we recommend going through the `GitHub "issues" tab `_ -to find issues that interest you. There are a number of issues listed under -`Documentation `_ +to find issues that interest you. +Some issues are particularly suited for new contributors, as indicated by the label `Documentation `_ and `good first issue -`_ -where you could start out. Once you've found an interesting issue, you can -return here to get your development environment setup. +`_ where you could start out. +These are well-documented issues that do not require a deep understanding of the internals of xarray. -Feel free to ask questions on the `mailing list -`_. +Once you've found an interesting issue, you can return here to get your development environment set up. +The xarray project does not assign issues. Issues are "assigned" by opening a Pull Request (PR). .. _contributing.bug_reports: @@ -34,15 +51,20 @@ Bug reports and enhancement requests ==================================== Bug reports are an important part of making *xarray* more stable. Having a complete bug -report will allow others to reproduce the bug and provide insight into fixing. See -this `stackoverflow article for tips on -writing a good bug report `_ . +report will allow others to reproduce the bug and provide insight into fixing it. Trying out the bug-producing code on the *main* branch is often a worthwhile exercise to confirm that the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed. -Bug reports must: +Submitting a bug report +----------------------- + +If you find a bug in the code or documentation, do not hesitate to submit a ticket to the +`Issue Tracker `_. +You are also welcome to post feature requests or pull requests. + +If you are reporting a bug, please use the provided template, which includes the following: #. Include a short, self-contained Python snippet reproducing the problem.
You can format the code nicely by using `GitHub Flavored Markdown @@ -67,13 +89,12 @@ Bug reports must: #. Explain why the current behavior is wrong/not desired and what you expect instead. -The issue will then show up to the *xarray* community and be open to comments/ideas -from others. +The issue will then show up to the *xarray* community and be open to comments/ideas from others. -.. _contributing.github: +See this `stackoverflow article for tips on writing a good bug report `_ . -Working with the code -===================== + +.. _contributing.github: Now that you have an issue you want to fix, enhancement to add, or documentation to improve, you need to learn how to work with GitHub and the *xarray* code base. @@ -81,12 +102,7 @@ to improve, you need to learn how to work with GitHub and the *xarray* code base .. _contributing.version_control: Version control, Git, and GitHub --------------------------------- - -To the new user, working with Git is one of the more daunting aspects of contributing -to *xarray*. It can very quickly become overwhelming, but sticking to the guidelines -below will help keep the process straightforward and mostly trouble free. As always, -if you are having difficulties please feel free to ask for help. +================================ The code is hosted on `GitHub `_. To contribute you will need to sign up for a `free GitHub account @@ -112,41 +128,41 @@ you can work seamlessly between your local repository and GitHub. but contributors who are new to git may find it easier to use other tools instead such as `Github Desktop `_. -.. _contributing.forking: +Development workflow +==================== + +To keep your work well organized, with readable history, and in turn make it easier for project +maintainers to see what you've done, and why you did it, we recommend you to follow workflow: -Forking -------- +1. `Create an account `_ on GitHub if you do not already have one. -You will need your own fork to work on the code. Go to the `xarray project -page `_ and hit the ``Fork`` button. You will -want to clone your fork to your machine:: +2. You will need your own fork to work on the code. Go to the `xarray project + page `_ and hit the ``Fork`` button near the top of the page. + This creates a copy of the code under your account on the GitHub server. + +3. Clone your fork to your machine:: git clone https://github.com/your-user-name/xarray.git cd xarray git remote add upstream https://github.com/pydata/xarray.git -This creates the directory `xarray` and connects your repository to -the upstream (main project) *xarray* repository. - -Creating a branch ------------------ - -You want your ``main`` branch to reflect only production-ready code, so create a -feature branch before making your changes. For example:: + This creates the directory `xarray` and connects your repository to + the upstream (main project) *xarray* repository. - git branch shiny-new-feature - git checkout shiny-new-feature +Creating a development environment +---------------------------------- -The above can be simplified to:: +To test out code changes locally, you'll need to build *xarray* from source, which requires you to +`create a local development environment `_. - git checkout -b shiny-new-feature +Update the ``main`` branch +-------------------------- -This changes your working directory to the shiny-new-feature branch. Keep any -changes in this branch specific to one bug or feature so it is clear -what the branch brings to *xarray*. 
You can have many "shiny-new-features" -and switch in between them using the ``git checkout`` command. +First make sure you have followed `Setting up xarray for development +`_ -To update this branch, you need to retrieve the changes from the ``main`` branch:: +Before starting a new set of changes, fetch all changes from ``upstream/main``, and start a new +feature branch from that. From time to time you should fetch the upstream changes from GitHub: :: git fetch upstream git merge upstream/main @@ -157,10 +173,83 @@ request. If you have uncommitted changes, you will need to ``git stash`` them prior to updating. This will effectively store your changes, which can be reapplied after updating. +Create a new feature branch +--------------------------- + +Create a branch to save your changes, even before you start making changes. You want your +``main`` branch to contain only production-ready code:: + + git checkout -b shiny-new-feature + +This changes your working directory to the ``shiny-new-feature`` branch. Keep any changes in this +branch specific to one bug or feature so it is clear what the branch brings to *xarray*. You can have +many "shiny-new-features" and switch in between them using the ``git checkout`` command. + +Generally, you will want to keep your feature branches on your public GitHub fork of xarray. To do this, +you ``git push`` this new branch up to your GitHub repo. Generally (if you followed the instructions in +these pages, and by default), git will have a link to your fork of the GitHub repo, called ``origin``. +You push up to your own fork with: :: + + git push origin shiny-new-feature + +In git >= 1.7 you can ensure that the link is correctly set by using the ``--set-upstream`` option: :: + + git push --set-upstream origin shiny-new-feature + +From now on git will know that ``shiny-new-feature`` is related to the ``shiny-new-feature`` branch in the GitHub repo. + +The editing workflow +-------------------- + +1. Make some changes. + +2. See which files have changed with ``git status``. You'll see a listing like this one: :: + + # On branch shiny-new-feature + # Changed but not updated: + # (use "git add ..." to update what will be committed) + # (use "git checkout -- ..." to discard changes in working directory) + # + # modified: README + +3. Check what the actual changes are with ``git diff``. + +4. Build the `documentation `_ +for the documentation changes. + +`Run the test suite `_ for code changes. + +Commit and push your changes +---------------------------- + +1. To commit all modified files into the local copy of your repo, do ``git commit -am 'A commit message'``. + +2. To push the changes up to your forked repo on GitHub, do a ``git push``. + +Open a pull request +------------------- + +When you're ready or need feedback on your code, open a Pull Request (PR) so that the xarray developers can +give feedback and eventually include your suggested code into the ``main`` branch. +`Pull requests (PRs) on GitHub `_ +are the mechanism for contributing to xarray's code and documentation. + +Enter a title for the set of changes with some explanation of what you've done. +Follow the PR template, which looks like this:: + + [ ] Closes #xxxx + [ ] Tests added + [ ] User visible changes (including notable bug fixes) are documented in whats-new.rst + [ ] New functions/methods are listed in api.rst + +Mention anything you'd like particular attention for - such as a complicated change or some code you are not happy with.
+If you don't think your request is ready to be merged, just say so in your pull request message and use +the "Draft PR" feature of GitHub. This is a good way of getting some preliminary code review. + .. _contributing.dev_env: Creating a development environment ----------------------------------- +================================== To test out code changes locally, you'll need to build *xarray* from source, which requires a Python environment. If you're making documentation changes, you can @@ -182,7 +271,7 @@ documentation locally before pushing your changes. .. _contributing.dev_python: Creating a Python Environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +----------------------------- Before starting any development, you'll need to create an isolated xarray development environment: @@ -240,6 +329,22 @@ To return to your root environment:: See the full `conda docs here `__. +Install pre-commit hooks +------------------------ + +We highly recommend that you setup `pre-commit `_ hooks to automatically +run all the above tools every time you make a git commit. To install the hooks:: + + python -m pip install pre-commit + pre-commit install + +This can be done by running: :: + + pre-commit run + +from the root of the xarray repository. You can skip the pre-commit checks with +``git commit --no-verify``. + .. _contributing.documentation: Contributing to the documentation @@ -363,6 +468,60 @@ If you want to do a full clean build, do:: make clean make html +Writing ReST pages +------------------ + +Most documentation is either in the docstrings of individual classes and methods, in explicit +``.rst`` files, or in examples and tutorials. All of these use the +`ReST `_ syntax and are processed by +`Sphinx `_. + +This section contains additional information and conventions how ReST is used in the +xarray documentation. + +Section formatting +~~~~~~~~~~~~~~~~~~ + +We aim to follow the recommendations from the +`Python documentation `_ +and the `Sphinx reStructuredText documentation `_ +for section markup characters, + +- ``*`` with overline, for chapters + +- ``=``, for heading + +- ``-``, for sections + +- ``~``, for subsections + +- ``**`` text ``**``, for **bold** text + +Referring to other documents and sections +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Sphinx `_ allows internal +`references `_ between documents. + +Documents can be linked with the ``:doc:`` directive: + +:: + + See the :doc:`/getting-started-guide/installing` + + See the :doc:`/getting-started-guide/quick-overview` + +will render as: + +See the `Installation `_ + +See the `Quick Overview `_ + +Including figures and files +--------------------------- + +Image files can be directly included in pages with the ``image::`` directive. + .. _contributing.code: Contributing to the code base @@ -490,9 +649,7 @@ Writing tests All tests should go into the ``tests`` subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for -inspiration. If your test requires working with files or -network connectivity, there is more information on the `testing page -`_ of the wiki. +inspiration. 
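To make this concrete, here is a minimal sketch of what such a test might look like; the test name and the values it checks are purely illustrative, and it uses the ``xarray.testing`` helpers described just below.

.. code-block:: python

    # Hypothetical test, as it might live under xarray/tests/
    import numpy as np
    import xarray as xr
    from xarray.testing import assert_identical


    def test_fillna_replaces_missing_values() -> None:
        da = xr.DataArray([1.0, np.nan, 3.0], dims="x")
        expected = xr.DataArray([1.0, 0.0, 3.0], dims="x")
        # fillna should replace the NaN and leave the other values untouched
        assert_identical(da.fillna(0.0), expected)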
The ``xarray.testing`` module has many special ``assert`` functions that make it easier to make statements about whether DataArray or Dataset objects are diff --git a/doc/getting-started-guide/faq.rst b/doc/getting-started-guide/faq.rst index 08cb9646f94..e0e44dc7781 100644 --- a/doc/getting-started-guide/faq.rst +++ b/doc/getting-started-guide/faq.rst @@ -356,6 +356,25 @@ There may be situations where you need to specify the engine manually using the Some packages may have additional functionality beyond what is shown here. You can refer to the documentation for each package for more information. +How does xarray handle missing values? +-------------------------------------- + +**xarray can handle missing values using ``np.NaN``** + +- ``np.NaN`` is used to represent missing values in labeled arrays and datasets. It is a commonly used standard for representing missing or undefined numerical data in scientific computing. ``np.NaN`` is a constant value in NumPy that represents "Not a Number" or missing values. + +- Most of xarray's computation methods are designed to automatically handle missing values appropriately. + + For example, when performing operations like addition or multiplication on arrays that contain missing values, xarray will automatically ignore the missing values and only perform the operation on the valid data. This makes it easy to work with data that may contain missing or undefined values without having to worry about handling them explicitly. + +- Many of xarray's `aggregation methods `_, such as ``sum()``, ``mean()``, ``min()``, ``max()``, and others, have a skipna argument that controls whether missing values (represented by NaN) should be skipped (True) or treated as NaN (False) when performing the calculation. + + By default, ``skipna`` is set to `True`, so missing values are ignored when computing the result. However, you can set ``skipna`` to `False` if you want missing values to be treated as NaN and included in the calculation. + +- On `plotting `_ an xarray dataset or array that contains missing values, xarray will simply leave the missing values as blank spaces in the plot. + +- We have a set of `methods `_ for manipulating missing and filling values. + How should I cite xarray? ------------------------- diff --git a/doc/internals/extending-xarray.rst b/doc/internals/extending-xarray.rst index f8b61d12a2f..56aeb8fa462 100644 --- a/doc/internals/extending-xarray.rst +++ b/doc/internals/extending-xarray.rst @@ -1,6 +1,6 @@ -Extending xarray -================ +Extending xarray using accessors +================================ .. ipython:: python :suppress: @@ -8,11 +8,11 @@ Extending xarray import xarray as xr -Xarray is designed as a general purpose library, and hence tries to avoid +Xarray is designed as a general purpose library and hence tries to avoid including overly domain specific functionality. But inevitably, the need for more domain specific logic arises. -One standard solution to this problem is to subclass Dataset and/or DataArray to +One potential solution to this problem is to subclass Dataset and/or DataArray to add domain specific functionality. However, inheritance is not very robust. It's easy to inadvertently use internal APIs when subclassing, which means that your code may break when xarray upgrades. Furthermore, many builtin methods will @@ -29,7 +29,9 @@ from pandas) may suffice. 
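As a brief illustration of the ``skipna`` behaviour described in the FAQ entry on missing values above, consider the following sketch; the array values are arbitrary:

.. code-block:: python

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, 3.0], dims="x")
    # skipna=True is the default, so the NaN is ignored: the mean of [1, 3]
    da.mean()  # -> 2.0
    # With skipna=False the missing value propagates into the result
    da.mean(skipna=False)  # -> nan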
To resolve this issue for more complex cases, xarray has the :py:func:`~xarray.register_dataset_accessor` and :py:func:`~xarray.register_dataarray_accessor` decorators for adding custom -"accessors" on xarray objects. Here's how you might use these decorators to +"accessors" on xarray objects, thereby "extending" the functionality of your xarray object. + +Here's how you might use these decorators to write a custom "geo" accessor implementing a geography specific extension to xarray: diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst index f913ea41a91..d1f1274c7a1 100644 --- a/doc/user-guide/computation.rst +++ b/doc/user-guide/computation.rst @@ -68,28 +68,112 @@ Data arrays also implement many :py:class:`numpy.ndarray` methods: Missing values ============== +Xarray represents missing values using the "NaN" (Not a Number) value from NumPy, which is a +special floating-point value that indicates a value that is undefined or unrepresentable. +There are several methods for handling missing values in xarray: + Xarray objects borrow the :py:meth:`~xarray.DataArray.isnull`, :py:meth:`~xarray.DataArray.notnull`, :py:meth:`~xarray.DataArray.count`, :py:meth:`~xarray.DataArray.dropna`, :py:meth:`~xarray.DataArray.fillna`, :py:meth:`~xarray.DataArray.ffill`, and :py:meth:`~xarray.DataArray.bfill` methods for working with missing data from pandas: +:py:meth:`~xarray.DataArray.isnull` is a method in xarray that can be used to check for missing or null values in an xarray object. +It returns a new xarray object with the same dimensions as the original object, but with boolean values +indicating where **missing values** are present. + .. ipython:: python x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.isnull() + +In this example, the third and fourth elements of 'x' are NaN, so the resulting :py:class:`~xarray.DataArray` +object has 'True' values in the third and fourth positions and 'False' values in the other positions. + +:py:meth:`~xarray.DataArray.notnull` is a method in xarray that can be used to check for non-missing or non-null values in an xarray +object. It returns a new xarray object with the same dimensions as the original object, but with boolean +values indicating where **non-missing values** are present. + +.. ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.notnull() + +In this example, the first two and the last elements of x are not NaN, so the resulting +:py:class:`~xarray.DataArray` object has 'True' values in these positions, and 'False' values in the +third and fourth positions where NaN is located. + +:py:meth:`~xarray.DataArray.count` is a method in xarray that can be used to count the number of +non-missing values along one or more dimensions of an xarray object. It returns a new xarray object with +the same dimensions as the original object, but with each element replaced by the count of non-missing +values along the specified dimensions. + +.. ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.count() + +In this example, 'x' has five elements, but two of them are NaN, so the resulting +:py:class:`~xarray.DataArray` object having a single element containing the value '3', which represents +the number of non-null elements in x. + +:py:meth:`~xarray.DataArray.dropna` is a method in xarray that can be used to remove missing or null values from an xarray object. +It returns a new xarray object with the same dimensions as the original object, but with missing values +removed. + +.. 
ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.dropna(dim="x") + +In this example, calling x.dropna(dim="x") removes any missing values and returns a new +:py:class:`~xarray.DataArray` object with only the non-null elements [0, 1, 2] of 'x', in the +original order. + +:py:meth:`~xarray.DataArray.fillna` is a method in xarray that can be used to fill missing or null values in an xarray object with a +specified value or method. It returns a new xarray object with the same dimensions as the original object, but with missing values filled. + +.. ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.fillna(-1) + +In this example, there are two NaN values in 'x', so calling x.fillna(-1) replaces these values with -1 and +returns a new :py:class:`~xarray.DataArray` object with five elements, containing the values +[0, 1, -1, -1, 2] in the original order. + +:py:meth:`~xarray.DataArray.ffill` is a method in xarray that can be used to forward fill (or fill forward) missing values in an +xarray object along one or more dimensions. It returns a new xarray object with the same dimensions as the +original object, but with missing values replaced by the last non-missing value along the specified dimensions. + +.. ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.ffill("x") + +In this example, there are two NaN values in 'x', so calling x.ffill("x") fills these values with the last +non-null value in the same dimension, which is 1 in both cases. The resulting :py:class:`~xarray.DataArray` object has +five elements, containing the values [0, 1, 1, 1, 2] in the original order. + +:py:meth:`~xarray.DataArray.bfill` is a method in xarray that can be used to backward fill (or fill backward) missing values in an +xarray object along one or more dimensions. It returns a new xarray object with the same dimensions as the original object, but +with missing values replaced by the next non-missing value along the specified dimensions. + +.. ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"]) x.bfill("x") +In this example, there are two NaN values in 'x', so calling x.bfill("x") fills these values with the next +non-null value in the same dimension, which is 2 in both cases. The resulting :py:class:`~xarray.DataArray` object has +five elements, containing the values [0, 1, 2, 2, 2] in the original order. + Like pandas, xarray uses the float value ``np.nan`` (not-a-number) to represent missing values. Xarray objects also have an :py:meth:`~xarray.DataArray.interpolate_na` method -for filling missing values via 1D interpolation. +for filling missing values via 1D interpolation. It returns a new xarray object with the same dimensions +as the original object, but with missing values interpolated. .. ipython:: python @@ -100,6 +184,13 @@ for filling missing values via 1D interpolation. ) x.interpolate_na(dim="x", method="linear", use_coordinate="xx") +In this example, there are two NaN values in 'x', so calling x.interpolate_na(dim="x", method="linear", +use_coordinate="xx") fills these values with interpolated values along the "x" dimension using linear +interpolation based on the values of the xx coordinate. The resulting :py:class:`~xarray.DataArray` object has five elements, +containing the values [0., 1., 1.05, 1.45, 2.] in the original order.
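For quick reference, the fill-oriented methods discussed above can be compared side by side on the same example array; the commented results follow from the walkthrough above:

.. code-block:: python

    import numpy as np
    import xarray as xr

    x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=["x"])
    x.fillna(-1)   # [0., 1., -1., -1., 2.]  fill with a constant
    x.ffill("x")   # [0., 1., 1., 1., 2.]    propagate the last valid value forward
    x.bfill("x")   # [0., 1., 2., 2., 2.]    propagate the next valid value backward
    x.dropna("x")  # [0., 1., 2.]            drop the missing values entirely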
Note that the interpolated values +are calculated based on the values of the 'xx' coordinate, which has non-integer values, resulting in +non-integer interpolated values. + Note that xarray slightly diverges from the pandas ``interpolate`` syntax by providing the ``use_coordinate`` keyword which facilitates a clear specification of which values to use as the index in the interpolation. diff --git a/doc/user-guide/indexing.rst b/doc/user-guide/indexing.rst index 492316f898f..90b7cbaf2a9 100644 --- a/doc/user-guide/indexing.rst +++ b/doc/user-guide/indexing.rst @@ -352,7 +352,6 @@ dimensions: ind_x = xr.DataArray([0, 1], dims=["x"]) ind_y = xr.DataArray([0, 1], dims=["y"]) da[ind_x, ind_y] # orthogonal indexing - da[ind_x, ind_x] # vectorized indexing Slices or sequences/arrays without named-dimensions are treated as if they have the same dimension which is indexed along: @@ -399,6 +398,12 @@ These methods may also be applied to ``Dataset`` objects Vectorized indexing may be used to extract information from the nearest grid cells of interest, for example, the nearest climate model grid cells to a collection specified weather station latitudes and longitudes. +To trigger vectorized indexing behavior +you will need to provide the selection dimensions with a new +shared output dimension name. In the example below, the selections +of the closest latitude and longitude are renamed to an output +dimension named "points": + .. ipython:: python diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index baeb3ee3c97..dc495b9f285 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -254,31 +254,22 @@ You can view this encoding information (among others) in the :py:attr:`DataArray.encoding` and :py:attr:`DataArray.encoding` attributes: -.. ipython:: - :verbatim: +.. ipython:: python - In [1]: ds_disk["y"].encoding - Out[1]: - {'zlib': False, - 'shuffle': False, - 'complevel': 0, - 'fletcher32': False, - 'contiguous': True, - 'chunksizes': None, - 'source': 'saved_on_disk.nc', - 'original_shape': (5,), - 'dtype': dtype('int64'), - 'units': 'days since 2000-01-01 00:00:00', - 'calendar': 'proleptic_gregorian'} - - In [9]: ds_disk.encoding - Out[9]: - {'unlimited_dims': set(), - 'source': 'saved_on_disk.nc'} + ds_disk["y"].encoding + ds_disk.encoding Note that all operations that manipulate variables other than indexing will remove encoding information. +In some cases it is useful to intentionally reset a dataset's original encoding values. +This can be done with either the :py:meth:`Dataset.reset_encoding` or +:py:meth:`DataArray.reset_encoding` methods. + +.. ipython:: python + + ds_no_encoding = ds_disk.reset_encoding() + ds_no_encoding.encoding .. _combining multiple files: @@ -617,6 +608,13 @@ store is already present at that path, an error will be raised, preventing it from being overwritten. To override this behavior and overwrite an existing store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`. +DataArrays can also be saved to disk using the :py:meth:`DataArray.to_zarr` method, +and loaded from disk using the :py:func:`open_dataarray` function with `engine='zarr'`. +Similar to :py:meth:`DataArray.to_netcdf`, :py:meth:`DataArray.to_zarr` will +convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back +when loading, ensuring that the ``DataArray`` that is loaded is always exactly +the same as the one that was saved. + .. note:: xarray does not write NCZarr attributes. 
Therefore, NCZarr data must be diff --git a/doc/user-guide/reshaping.rst b/doc/user-guide/reshaping.rst index 95bf21a71b0..2281106e7ec 100644 --- a/doc/user-guide/reshaping.rst +++ b/doc/user-guide/reshaping.rst @@ -4,7 +4,12 @@ Reshaping and reorganizing data ############################### -These methods allow you to reorganize your data by changing dimensions, array shape, order of values, or indexes. +Reshaping and reorganizing data refers to the process of changing the structure or organization of data by modifying dimensions, array shapes, order of values, or indexes. Xarray provides several methods to accomplish these tasks. + +These methods are particularly useful for reshaping xarray objects for use in machine learning packages, such as scikit-learn, which usually require two-dimensional numpy arrays as inputs. Reshaping can also be required before passing data to external visualization tools; for example, geospatial data might need to be organized into a particular format corresponding to stacks of satellite images. + +Importing the library +--------------------- .. ipython:: python :suppress: diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index d2e15adeba7..54d5dd764ae 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -225,6 +225,13 @@ resampling group: ds.resample(time="6H").reduce(np.mean) +You can also resample on the time dimension while reducing along other dimensions at the same time +by specifying the ``dim`` keyword argument: + +.. code-block:: python + + ds.resample(time="6H").mean(dim=["time", "latitude", "longitude"]) + For upsampling, xarray provides six methods: ``asfreq``, ``ffill``, ``bfill``, ``pad``, ``nearest`` and ``interpolate``. ``interpolate`` extends ``scipy.interpolate.interp1d`` and supports all of its schemes. All of these resampling operations work on both diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 30876eb36bc..e08784b3e09 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -57,14 +57,14 @@ CF-compliant coordinate variables .. _CFTimeIndex: -Non-standard calendars and dates outside the Timestamp-valid range ------------------------------------------------------------------- +Non-standard calendars and dates outside the nanosecond-precision range +----------------------------------------------------------------------- Through the standalone ``cftime`` library and a custom subclass of :py:class:`pandas.Index`, xarray supports a subset of the indexing functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for dates from non-standard calendars commonly used in climate science or dates -using a standard calendar, but outside the `Timestamp-valid range`_ +using a standard calendar, but outside the `nanosecond-precision range`_ (approximately between years 1678 and 2262). .. note:: @@ -75,13 +75,19 @@ using a standard calendar, but outside the `Timestamp-valid range`_ any of the following are true: - The dates are from a non-standard calendar - - Any dates are outside the Timestamp-valid range. + - Any dates are outside the nanosecond-precision range. Otherwise pandas-compatible dates from a standard calendar will be represented with the ``np.datetime64[ns]`` data type, enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[ns]`` and their full set of associated features.
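The passage above notes that pandas-compatible dates from a standard calendar are represented with the ``np.datetime64[ns]`` data type. A brief sketch of what that looks like in practice (the input precision chosen here is arbitrary):

.. code-block:: python

    import numpy as np
    import xarray as xr

    # Second-precision input is cast to nanosecond precision on construction
    times = np.array(["2000-01-01", "2000-01-02"], dtype="datetime64[s]")
    da = xr.DataArray([1, 2], coords={"time": times}, dims="time")
    da.time.dtype  # -> dtype('<M8[ns]'), i.e. datetime64[ns]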
+ As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime + values. For the time being, xarray still automatically casts datetime values + to nanosecond-precision for backwards compatibility with older pandas + versions; however, this is something we would like to relax going forward. + See :issue:`7493` for more discussion. + For example, you can create a DataArray indexed by a time coordinate with dates from a no-leap calendar and a :py:class:`~xarray.CFTimeIndex` will automatically be used: @@ -235,6 +241,6 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.resample(time="81T", closed="right", label="right", offset="3T").mean() -.. _Timestamp-valid range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations +.. _nanosecond-precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations .. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601 .. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d07667ca101..498a8b5b59c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,14 +15,44 @@ What's New np.random.seed(123456) + +.. _whats-new.2023.04.1: + +v2023.04.1 (April 18, 2023) +--------------------------- + +This is a patch release to fix a bug with binning (:issue:`7759`). + +Bug fixes +~~~~~~~~~ + +- Fix binning by unsorted arrays. (:issue:`7759`) + + .. _whats-new.2023.04.0: -v2023.04.0 (unreleased) ------------------------ +v2023.04.0 (April 14, 2023) +--------------------------- + +This release includes support for pandas v2, allows refreshing of backend engines in a session, and removes deprecated backends +for ``rasterio`` and ``cfgrib``. + +Thanks to our 19 contributors: +Chinemere, Tom Coleman, Deepak Cherian, Harshitha, Illviljan, Jessica Scheick, Joe Hamman, Justus Magin, Kai Mühlbauer, Kwonil-Kim, Mary Gathoni, Michael Niklas, Pierre, Scott Henderson, Shreyal Gupta, Spencer Clark, mccloskey, nishtha981, veenstrajelmer + +We welcome the following new contributors to Xarray!: +Mary Gathoni, Harshitha, veenstrajelmer, Chinemere, nishtha981, Shreyal Gupta, Kwonil-Kim, mccloskey. New Features ~~~~~~~~~~~~ - +- New methods to reset an object's encoding (:py:meth:`Dataset.reset_encoding`, :py:meth:`DataArray.reset_encoding`). + (:issue:`7686`, :pull:`7689`). + By `Joe Hamman `_. +- Allow refreshing backend engines with :py:meth:`xarray.backends.refresh_engines` (:issue:`7478`, :pull:`7523`). + By `Michael Niklas `_. +- Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`. + (:issue:`7692`, :pull:`7693`). + By `Joe Hamman `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -32,6 +62,12 @@ Breaking changes Deprecations ~~~~~~~~~~~~ +Performance +~~~~~~~~~~~ +- Optimize alignment with ``join="exact", copy=False`` by avoiding copies. (:pull:`7736`) + By `Deepak Cherian `_. +- Avoid unnecessary copies of ``CFTimeIndex``. (:pull:`7735`) + By `Deepak Cherian `_. Bug fixes ~~~~~~~~~ @@ -42,6 +78,15 @@ Bug fixes By `Thomas Coleman `_. - Proper plotting when passing :py:class:`~matplotlib.colors.BoundaryNorm` type argument in :py:meth:`DataArray.plot`. (:issue:`4061`, :issue:`7014`,:pull:`7553`) By `Jelmer Veenstra `_.
+- Ensure the formatting of time encoding reference dates outside the range of + nanosecond-precision datetimes remains the same under pandas version 2.0.0 + (:issue:`7420`, :pull:`7441`). + By `Justus Magin `_ and + `Spencer Clark `_. +- Various `dtype` related fixes needed to support `pandas>=2.0` (:pull:`7724`) + By `Justus Magin `_. +- Preserve boolean dtype within encoding (:issue:`7652`, :pull:`7720`). + By `Kai Mühlbauer `_ Documentation ~~~~~~~~~~~~~ @@ -52,9 +97,25 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Don't assume that arrays read from disk will be Numpy arrays. This is a step toward + enabling reads from a Zarr store using the `Kvikio `_ + or `TensorStore `_ libraries. + (:pull:`6874`). By `Deepak Cherian `_. + - Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external backend interface, so no existing code should break. By `Deepak Cherian `_. +- Implement CF coding functions in ``VariableCoders`` (:pull:`7719`). + By `Kai Mühlbauer `_ + +- Added a config.yml file with messages for the welcome bot when a Github user creates their first ever issue or pull request or has their first PR merged. (:issue:`7685`, :pull:`7685`) + By `Nishtha P `_. + +- Ensure that only nanosecond-precision :py:class:`pd.Timestamp` objects + continue to be used internally under pandas version 2.0.0. This is mainly to + ease the transition to this latest version of pandas. It should be relaxed + when addressing :issue:`7493`. By `Spencer Clark + `_ (:issue:`7707`, :pull:`7731`). .. _whats-new.2023.03.0: diff --git a/setup.cfg b/setup.cfg index c0dd5ff9595..81b7f1c4a0e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,7 +76,7 @@ include_package_data = True python_requires = >=3.9 install_requires = numpy >= 1.21 # recommended to use >= 1.22 for full quantile method support - pandas >= 1.4, <2 + pandas >= 1.4 packaging >= 21.3 [options.extras_require] diff --git a/xarray/backends/__init__.py b/xarray/backends/__init__.py index ca0b8fe4e6b..cf27998b6fb 100644 --- a/xarray/backends/__init__.py +++ b/xarray/backends/__init__.py @@ -12,7 +12,7 @@ from xarray.backends.h5netcdf_ import H5netcdfBackendEntrypoint, H5NetCDFStore from xarray.backends.memory import InMemoryDataStore from xarray.backends.netCDF4_ import NetCDF4BackendEntrypoint, NetCDF4DataStore -from xarray.backends.plugins import list_engines +from xarray.backends.plugins import list_engines, refresh_engines from xarray.backends.pseudonetcdf_ import ( PseudoNetCDFBackendEntrypoint, PseudoNetCDFDataStore, @@ -46,4 +46,5 @@ "StoreBackendEntrypoint", "ZarrBackendEntrypoint", "list_engines", + "refresh_engines", ] diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 050493e3034..bca8b7f668a 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: from io import BufferedIOBase + from xarray.core.dataset import Dataset + # Create a logger object, but don't add any handlers. Leave that to user code. 
logger = logging.getLogger(__name__) @@ -84,9 +86,9 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500 class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): __slots__ = () - def __array__(self, dtype=None): + def get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) - return np.asarray(self[key], dtype=dtype) + return self[key] # type: ignore [index] class AbstractDataStore: @@ -377,9 +379,6 @@ class BackendEntrypoint: Attributes ---------- - available : bool, default: True - Indicate wether this backend is available given the installed packages. - The setting of this attribute is not mandatory. open_dataset_parameters : tuple, default: None A list of ``open_dataset`` method parameters. The setting of this attribute is not mandatory. @@ -391,8 +390,6 @@ class BackendEntrypoint: The setting of this attribute is not mandatory. """ - available: ClassVar[bool] = True - open_dataset_parameters: ClassVar[tuple | None] = None description: ClassVar[str] = "" url: ClassVar[str] = "" @@ -408,9 +405,10 @@ def __repr__(self) -> str: def open_dataset( self, filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, drop_variables: str | Iterable[str] | None = None, **kwargs: Any, - ): + ) -> Dataset: """ Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`. """ @@ -420,7 +418,7 @@ def open_dataset( def guess_can_open( self, filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, - ): + ) -> bool: """ Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`. """ @@ -428,4 +426,5 @@ def guess_can_open( return False -BACKEND_ENTRYPOINTS: dict[str, type[BackendEntrypoint]] = {} +# mapping of engine name to (module name, BackendEntrypoint Class) +BACKEND_ENTRYPOINTS: dict[str, tuple[str | None, type[BackendEntrypoint]]] = {} diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index c4f75672173..7389f6a2862 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -3,6 +3,8 @@ import functools import io import os +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any from packaging.version import Version @@ -27,12 +29,17 @@ from xarray.core.utils import ( FrozenDict, is_remote_uri, - module_available, read_magic_number_from_file, try_read_magic_number_from_file_or_path, ) from xarray.core.variable import Variable +if TYPE_CHECKING: + from io import BufferedIOBase + + from xarray.backends.common import AbstractDataStore + from xarray.core.dataset import Dataset + class H5NetCDFArrayWrapper(BaseNetCDF4Array): def get_array(self, needs_lock=True): @@ -365,33 +372,34 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): backends.ScipyBackendEntrypoint """ - available = module_available("h5netcdf") description = ( "Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using h5netcdf in Xarray" ) url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.H5netcdfBackendEntrypoint.html" - def guess_can_open(self, filename_or_obj): + def guess_can_open( + self, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + ) -> bool: magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: return magic_number.startswith(b"\211HDF\r\n\032\n") - try: + if isinstance(filename_or_obj, (str, os.PathLike)): _, ext = os.path.splitext(filename_or_obj) - except TypeError: - return False + return ext 
in {".nc", ".nc4", ".cdf"} - return ext in {".nc", ".nc4", ".cdf"} + return False - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, format=None, @@ -400,7 +408,7 @@ def open_dataset( invalid_netcdf=None, phony_dims=None, decode_vlen_strings=True, - ): + ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = H5NetCDFStore.open( filename_or_obj, @@ -427,4 +435,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["h5netcdf"] = H5netcdfBackendEntrypoint +BACKEND_ENTRYPOINTS["h5netcdf"] = ("h5netcdf", H5netcdfBackendEntrypoint) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 0c6e083158d..d3866e90de6 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -3,7 +3,9 @@ import functools import operator import os +from collections.abc import Iterable from contextlib import suppress +from typing import TYPE_CHECKING, Any import numpy as np @@ -33,11 +35,16 @@ FrozenDict, close_on_error, is_remote_uri, - module_available, try_read_magic_number_from_path, ) from xarray.core.variable import Variable +if TYPE_CHECKING: + from io import BufferedIOBase + + from xarray.backends.common import AbstractDataStore + from xarray.core.dataset import Dataset + # This lookup table maps from dtype.byteorder to a readable endian # string used by netCDF4. _endian_lookup = {"=": "native", ">": "big", "<": "little", "|": "native"} @@ -535,33 +542,37 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): backends.ScipyBackendEntrypoint """ - available = module_available("netCDF4") description = ( "Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using netCDF4 in Xarray" ) url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.NetCDF4BackendEntrypoint.html" - def guess_can_open(self, filename_or_obj): + def guess_can_open( + self, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + ) -> bool: if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): return True magic_number = try_read_magic_number_from_path(filename_or_obj) if magic_number is not None: # netcdf 3 or HDF5 return magic_number.startswith((b"CDF", b"\211HDF\r\n\032\n")) - try: + + if isinstance(filename_or_obj, (str, os.PathLike)): _, ext = os.path.splitext(filename_or_obj) - except TypeError: - return False - return ext in {".nc", ".nc4", ".cdf"} + return ext in {".nc", ".nc4", ".cdf"} + + return False - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, group=None, @@ -572,7 +583,7 @@ def open_dataset( persist=False, lock=None, autoclose=False, - ): + ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = NetCDF4DataStore.open( filename_or_obj, @@ -601,4 +612,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["netcdf4"] = NetCDF4BackendEntrypoint +BACKEND_ENTRYPOINTS["netcdf4"] = 
("netCDF4", NetCDF4BackendEntrypoint) diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py index d6ad6dfbe18..232c2300192 100644 --- a/xarray/backends/plugins.py +++ b/xarray/backends/plugins.py @@ -6,12 +6,19 @@ import sys import warnings from importlib.metadata import entry_points -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable from xarray.backends.common import BACKEND_ENTRYPOINTS, BackendEntrypoint +from xarray.core.utils import module_available if TYPE_CHECKING: import os + from importlib.metadata import EntryPoint + + if sys.version_info >= (3, 10): + from importlib.metadata import EntryPoints + else: + EntryPoints = list[EntryPoint] from io import BufferedIOBase from xarray.backends.common import AbstractDataStore @@ -19,15 +26,15 @@ STANDARD_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"] -def remove_duplicates(entrypoints): +def remove_duplicates(entrypoints: EntryPoints) -> list[EntryPoint]: # sort and group entrypoints by name - entrypoints = sorted(entrypoints, key=lambda ep: ep.name) - entrypoints_grouped = itertools.groupby(entrypoints, key=lambda ep: ep.name) + entrypoints_sorted = sorted(entrypoints, key=lambda ep: ep.name) + entrypoints_grouped = itertools.groupby(entrypoints_sorted, key=lambda ep: ep.name) # check if there are multiple entrypoints for the same name unique_entrypoints = [] - for name, matches in entrypoints_grouped: + for name, _matches in entrypoints_grouped: # remove equal entrypoints - matches = list(set(matches)) + matches = list(set(_matches)) unique_entrypoints.append(matches[0]) matches_len = len(matches) if matches_len > 1: @@ -42,7 +49,7 @@ def remove_duplicates(entrypoints): return unique_entrypoints -def detect_parameters(open_dataset): +def detect_parameters(open_dataset: Callable) -> tuple[str, ...]: signature = inspect.signature(open_dataset) parameters = signature.parameters parameters_list = [] @@ -60,7 +67,9 @@ def detect_parameters(open_dataset): return tuple(parameters_list) -def backends_dict_from_pkg(entrypoints): +def backends_dict_from_pkg( + entrypoints: list[EntryPoint], +) -> dict[str, type[BackendEntrypoint]]: backend_entrypoints = {} for entrypoint in entrypoints: name = entrypoint.name @@ -72,14 +81,18 @@ def backends_dict_from_pkg(entrypoints): return backend_entrypoints -def set_missing_parameters(backend_entrypoints): - for name, backend in backend_entrypoints.items(): +def set_missing_parameters( + backend_entrypoints: dict[str, type[BackendEntrypoint]] +) -> None: + for _, backend in backend_entrypoints.items(): if backend.open_dataset_parameters is None: open_dataset = backend.open_dataset backend.open_dataset_parameters = detect_parameters(open_dataset) -def sort_backends(backend_entrypoints): +def sort_backends( + backend_entrypoints: dict[str, type[BackendEntrypoint]] +) -> dict[str, type[BackendEntrypoint]]: ordered_backends_entrypoints = {} for be_name in STANDARD_BACKENDS_ORDER: if backend_entrypoints.get(be_name, None) is not None: @@ -90,13 +103,13 @@ def sort_backends(backend_entrypoints): return ordered_backends_entrypoints -def build_engines(entrypoints) -> dict[str, BackendEntrypoint]: - backend_entrypoints = {} - for backend_name, backend in BACKEND_ENTRYPOINTS.items(): - if backend.available: +def build_engines(entrypoints: EntryPoints) -> dict[str, BackendEntrypoint]: + backend_entrypoints: dict[str, type[BackendEntrypoint]] = {} + for backend_name, (module_name, backend) in BACKEND_ENTRYPOINTS.items(): + if module_name is None or 
module_available(module_name): backend_entrypoints[backend_name] = backend - entrypoints = remove_duplicates(entrypoints) - external_backend_entrypoints = backends_dict_from_pkg(entrypoints) + entrypoints_unique = remove_duplicates(entrypoints) + external_backend_entrypoints = backends_dict_from_pkg(entrypoints_unique) backend_entrypoints.update(external_backend_entrypoints) backend_entrypoints = sort_backends(backend_entrypoints) set_missing_parameters(backend_entrypoints) @@ -122,10 +135,15 @@ def list_engines() -> dict[str, BackendEntrypoint]: if sys.version_info >= (3, 10): entrypoints = entry_points(group="xarray.backends") else: - entrypoints = entry_points().get("xarray.backends", ()) + entrypoints = entry_points().get("xarray.backends", []) return build_engines(entrypoints) +def refresh_engines() -> None: + """Refreshes the backend engines based on installed packages.""" + list_engines.cache_clear() + + def guess_engine( store_spec: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, ): @@ -141,7 +159,7 @@ def guess_engine( warnings.warn(f"{engine!r} fails while guessing", RuntimeWarning) compatible_engines = [] - for engine, backend_cls in BACKEND_ENTRYPOINTS.items(): + for engine, (_, backend_cls) in BACKEND_ENTRYPOINTS.items(): try: backend = backend_cls() if backend.guess_can_open(store_spec): diff --git a/xarray/backends/pseudonetcdf_.py b/xarray/backends/pseudonetcdf_.py index ae8f90e3a44..71cdd3199e0 100644 --- a/xarray/backends/pseudonetcdf_.py +++ b/xarray/backends/pseudonetcdf_.py @@ -1,5 +1,8 @@ from __future__ import annotations +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any + import numpy as np from xarray.backends.common import ( @@ -13,9 +16,15 @@ from xarray.backends.locks import HDF5_LOCK, NETCDFC_LOCK, combine_locks, ensure_lock from xarray.backends.store import StoreBackendEntrypoint from xarray.core import indexing -from xarray.core.utils import Frozen, FrozenDict, close_on_error, module_available +from xarray.core.utils import Frozen, FrozenDict, close_on_error from xarray.core.variable import Variable +if TYPE_CHECKING: + import os + from io import BufferedIOBase + + from xarray.core.dataset import Dataset + # psuedonetcdf can invoke netCDF libraries internally PNETCDF_LOCK = combine_locks([HDF5_LOCK, NETCDFC_LOCK]) @@ -121,7 +130,6 @@ class PseudoNetCDFBackendEntrypoint(BackendEntrypoint): backends.PseudoNetCDFDataStore """ - available = module_available("PseudoNetCDF") description = ( "Open many atmospheric science data formats using PseudoNetCDF in Xarray" ) @@ -144,18 +152,18 @@ class PseudoNetCDFBackendEntrypoint(BackendEntrypoint): def open_dataset( self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, mask_and_scale=False, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, mode=None, lock=None, **format_kwargs, - ): + ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = PseudoNetCDFDataStore.open( filename_or_obj, lock=lock, mode=mode, **format_kwargs @@ -176,4 +184,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["pseudonetcdf"] = PseudoNetCDFBackendEntrypoint +BACKEND_ENTRYPOINTS["pseudonetcdf"] = ("PseudoNetCDF", PseudoNetCDFBackendEntrypoint) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index df26a03d790..116c48f5692 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ 
-1,5 +1,8 @@ from __future__ import annotations +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any + import numpy as np from packaging.version import Version @@ -19,10 +22,15 @@ close_on_error, is_dict_like, is_remote_uri, - module_available, ) from xarray.core.variable import Variable +if TYPE_CHECKING: + import os + from io import BufferedIOBase + + from xarray.core.dataset import Dataset + class PydapArrayWrapper(BackendArray): def __init__(self, array): @@ -46,6 +54,7 @@ def _getitem(self, key): # downloading coordinate data twice array = getattr(self.array, "array", self.array) result = robust_getitem(array, key, catch=ValueError) + result = np.asarray(result) # in some cases, pydap doesn't squeeze axes automatically like numpy axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != array.ndim and axis: @@ -154,21 +163,24 @@ class PydapBackendEntrypoint(BackendEntrypoint): backends.PydapDataStore """ - available = module_available("pydap") description = "Open remote datasets via OPeNDAP using pydap in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" - def guess_can_open(self, filename_or_obj): + def guess_can_open( + self, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + ) -> bool: return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, application=None, @@ -177,7 +189,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, - ): + ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, application=application, @@ -203,4 +215,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["pydap"] = PydapBackendEntrypoint +BACKEND_ENTRYPOINTS["pydap"] = ("pydap", PydapBackendEntrypoint) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 611ea978990..75e96ffdc0a 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -1,6 +1,8 @@ from __future__ import annotations import warnings +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any import numpy as np @@ -21,9 +23,15 @@ ) from xarray.backends.store import StoreBackendEntrypoint from xarray.core import indexing -from xarray.core.utils import Frozen, FrozenDict, close_on_error, module_available +from xarray.core.utils import Frozen, FrozenDict, close_on_error from xarray.core.variable import Variable +if TYPE_CHECKING: + import os + from io import BufferedIOBase + + from xarray.core.dataset import Dataset + # PyNIO can invoke netCDF libraries internally # Add a dedicated lock just in case NCL as well isn't thread-safe. 
NCL_LOCK = SerializableLock() @@ -117,21 +125,20 @@ class PynioBackendEntrypoint(BackendEntrypoint): https://github.com/pydata/xarray/issues/4491 for more information """ - available = module_available("Nio") - - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, mode="r", lock=None, - ): + ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = NioDataStore( filename_or_obj, @@ -154,4 +161,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["pynio"] = PynioBackendEntrypoint +BACKEND_ENTRYPOINTS["pynio"] = ("Nio", PynioBackendEntrypoint) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 651aebce2ce..1ecc70cf376 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -3,6 +3,8 @@ import gzip import io import os +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any import numpy as np @@ -26,11 +28,16 @@ Frozen, FrozenDict, close_on_error, - module_available, try_read_magic_number_from_file_or_path, ) from xarray.core.variable import Variable +if TYPE_CHECKING: + from io import BufferedIOBase + + from xarray.backends.common import AbstractDataStore + from xarray.core.dataset import Dataset + def _decode_string(s): if isinstance(s, bytes): @@ -261,32 +268,35 @@ class ScipyBackendEntrypoint(BackendEntrypoint): backends.H5netcdfBackendEntrypoint """ - available = module_available("scipy") description = "Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html" - def guess_can_open(self, filename_or_obj): + def guess_can_open( + self, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + ) -> bool: magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None and magic_number.startswith(b"\x1f\x8b"): - with gzip.open(filename_or_obj) as f: + with gzip.open(filename_or_obj) as f: # type: ignore[arg-type] magic_number = try_read_magic_number_from_file_or_path(f) if magic_number is not None: return magic_number.startswith(b"CDF") - try: + if isinstance(filename_or_obj, (str, os.PathLike)): _, ext = os.path.splitext(filename_or_obj) - except TypeError: - return False - return ext in {".nc", ".nc4", ".cdf", ".gz"} + return ext in {".nc", ".nc4", ".cdf", ".gz"} + + return False - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, mode="r", @@ -294,7 +304,7 @@ def open_dataset( group=None, mmap=None, lock=None, - ): + ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = ScipyDataStore( filename_or_obj, mode=mode, format=format, group=group, mmap=mmap, lock=lock @@ -315,4 +325,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["scipy"] = ScipyBackendEntrypoint +BACKEND_ENTRYPOINTS["scipy"] = ("scipy", 
ScipyBackendEntrypoint) diff --git a/xarray/backends/store.py b/xarray/backends/store.py index 1f7a44bf4dc..a507ee37470 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -1,5 +1,8 @@ from __future__ import annotations +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any + from xarray import conventions from xarray.backends.common import ( BACKEND_ENTRYPOINTS, @@ -8,29 +11,37 @@ ) from xarray.core.dataset import Dataset +if TYPE_CHECKING: + import os + from io import BufferedIOBase + class StoreBackendEntrypoint(BackendEntrypoint): - available = True description = "Open AbstractDataStore instances in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.StoreBackendEntrypoint.html" - def guess_can_open(self, filename_or_obj): + def guess_can_open( + self, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + ) -> bool: return isinstance(filename_or_obj, AbstractDataStore) - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - store, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, - ): - vars, attrs = store.load() - encoding = store.get_encoding() + ) -> Dataset: + assert isinstance(filename_or_obj, AbstractDataStore) + + vars, attrs = filename_or_obj.load() + encoding = filename_or_obj.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( vars, @@ -46,10 +57,10 @@ def open_dataset( ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.intersection(vars)) - ds.set_close(store.close) + ds.set_close(filename_or_obj.close) ds.encoding = encoding return ds -BACKEND_ENTRYPOINTS["store"] = StoreBackendEntrypoint +BACKEND_ENTRYPOINTS["store"] = (None, StoreBackendEntrypoint) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index bc251d05631..7d21c771e06 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -3,6 +3,8 @@ import json import os import warnings +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any import numpy as np @@ -22,10 +24,16 @@ FrozenDict, HiddenKeyDict, close_on_error, - module_available, ) from xarray.core.variable import Variable +if TYPE_CHECKING: + from io import BufferedIOBase + + from xarray.backends.common import AbstractDataStore + from xarray.core.dataset import Dataset + + # need some special secret attributes to tell us the dimensions DIMENSION_KEY = "_ARRAY_DIMENSIONS" @@ -863,25 +871,28 @@ class ZarrBackendEntrypoint(BackendEntrypoint): backends.ZarrStore """ - available = module_available("zarr") description = "Open zarr files (.zarr) using zarr in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ZarrBackendEntrypoint.html" - def guess_can_open(self, filename_or_obj): - try: + def guess_can_open( + self, + filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + ) -> bool: + if isinstance(filename_or_obj, (str, os.PathLike)): _, ext = os.path.splitext(filename_or_obj) - except TypeError: - return False - return ext in {".zarr"} + return ext in {".zarr"} + + return False - def open_dataset( + def open_dataset( # type: ignore[override] # allow LSP violation, not supporting **kwargs self, - filename_or_obj, + filename_or_obj: str | os.PathLike[Any] | 
BufferedIOBase | AbstractDataStore, + *, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, + drop_variables: str | Iterable[str] | None = None, use_cftime=None, decode_timedelta=None, group=None, @@ -892,7 +903,7 @@ def open_dataset( storage_options=None, stacklevel=3, zarr_version=None, - ): + ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = ZarrStore.open_group( filename_or_obj, @@ -922,4 +933,4 @@ def open_dataset( return ds -BACKEND_ENTRYPOINTS["zarr"] = ZarrBackendEntrypoint +BACKEND_ENTRYPOINTS["zarr"] = ("zarr", ZarrBackendEntrypoint) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 792724ecc79..a746163c3fd 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -57,7 +57,12 @@ format_cftime_datetime, ) from xarray.core.common import _contains_datetime_like_objects, is_np_datetime_like -from xarray.core.pdcompat import NoDefault, count_not_none, no_default +from xarray.core.pdcompat import ( + NoDefault, + count_not_none, + nanosecond_precision_timestamp, + no_default, +) from xarray.core.utils import emit_user_level_warning try: @@ -1286,8 +1291,10 @@ def date_range_like(source, calendar, use_cftime=None): if is_np_datetime_like(source.dtype): # We want to use datetime fields (datetime64 object don't have them) source_calendar = "standard" - source_start = pd.Timestamp(source_start) - source_end = pd.Timestamp(source_end) + # TODO: the strict enforcement of nanosecond precision Timestamps can be + # relaxed when addressing GitHub issue #7493. + source_start = nanosecond_precision_timestamp(source_start) + source_end = nanosecond_precision_timestamp(source_end) else: if isinstance(source, CFTimeIndex): source_calendar = source.calendar diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 7227ba9edb6..c6a7b9f8763 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -613,7 +613,7 @@ def to_datetimeindex(self, unsafe=False): ------ ValueError If the CFTimeIndex contains dates that are not possible in the - standard calendar or outside the pandas.Timestamp-valid range. + standard calendar or outside the nanosecond-precision range. Warns ----- diff --git a/xarray/coding/times.py b/xarray/coding/times.py index f9e79863d46..3745d61acc0 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -23,6 +23,7 @@ from xarray.core import indexing from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.formatting import first_n_items, format_timestamp, last_item +from xarray.core.pdcompat import nanosecond_precision_timestamp from xarray.core.pycompat import is_duck_dask_array from xarray.core.variable import Variable @@ -224,7 +225,9 @@ def _decode_datetime_with_pandas( delta, ref_date = _unpack_netcdf_time_units(units) delta = _netcdf_to_numpy_timeunit(delta) try: - ref_date = pd.Timestamp(ref_date) + # TODO: the strict enforcement of nanosecond precision Timestamps can be + # relaxed when addressing GitHub issue #7493. 
+ ref_date = nanosecond_precision_timestamp(ref_date) except ValueError: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime @@ -391,7 +394,9 @@ def infer_datetime_units(dates) -> str: dates = to_datetime_unboxed(dates) dates = dates[pd.notnull(dates)] reference_date = dates[0] if len(dates) > 0 else "1970-01-01" - reference_date = pd.Timestamp(reference_date) + # TODO: the strict enforcement of nanosecond precision Timestamps can be + # relaxed when addressing GitHub issue #7493. + reference_date = nanosecond_precision_timestamp(reference_date) else: reference_date = dates[0] if len(dates) > 0 else "1970-01-01" reference_date = format_cftime_datetime(reference_date) @@ -432,6 +437,8 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: If raise_on_invalid is True (default), invalid dates trigger a ValueError. Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) + # TODO: the strict enforcement of nanosecond precision datetime values can + # be relaxed when addressing GitHub issue #7493. new = np.empty(times.shape, dtype="M8[ns]") for i, t in np.ndenumerate(times): try: @@ -439,7 +446,7 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: # NumPy casts it safely it np.datetime64[ns] for dates outside # 1678 to 2262 (this is not currently the case for # datetime.datetime). - dt = pd.Timestamp( + dt = nanosecond_precision_timestamp( t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond ) except ValueError as e: @@ -498,6 +505,10 @@ def convert_time_or_go_back(date, date_type): This is meant to convert end-of-month dates into a new calendar. """ + # TODO: the strict enforcement of nanosecond precision Timestamps can be + # relaxed when addressing GitHub issue #7493. + if date_type == pd.Timestamp: + date_type = nanosecond_precision_timestamp try: return date_type( date.year, @@ -641,7 +652,10 @@ def encode_cf_datetime( delta_units = _netcdf_to_numpy_timeunit(delta) time_delta = np.timedelta64(1, delta_units).astype("timedelta64[ns]") - ref_date = pd.Timestamp(_ref_date) + + # TODO: the strict enforcement of nanosecond precision Timestamps can be + # relaxed when addressing GitHub issue #7493. + ref_date = nanosecond_precision_timestamp(_ref_date) # If the ref_date Timestamp is timezone-aware, convert to UTC and # make it timezone-naive (GH 2649). diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index c290307b4b6..5c6e51c2215 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -69,8 +69,8 @@ def dtype(self) -> np.dtype: def __getitem__(self, key): return type(self)(self.array[key], self.func, self.dtype) - def __array__(self, dtype=None): - return self.func(self.array) + def get_duck_array(self): + return self.func(self.array.get_duck_array()) def __repr__(self) -> str: return "{}({!r}, func={!r}, dtype={!r})".format( @@ -78,6 +78,71 @@ def __repr__(self) -> str: ) +class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from non-native to native endianness + + This is useful for decoding arrays from netCDF3 files (which are all + big endian) into native endianness, so they can be used with Cython + functions, such as those found in bottleneck and pandas. 
+ + >>> x = np.arange(5, dtype=">i2") + + >>> x.dtype + dtype('>i2') + + >>> NativeEndiannessArray(x).dtype + dtype('int16') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> NativeEndiannessArray(x)[indexer].dtype + dtype('int16') + """ + + __slots__ = ("array",) + + def __init__(self, array) -> None: + self.array = indexing.as_indexable(array) + + @property + def dtype(self) -> np.dtype: + return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) + + def __getitem__(self, key) -> np.ndarray: + return np.asarray(self.array[key], dtype=self.dtype) + + +class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from integer to boolean datatype + + This is useful for decoding boolean arrays from integer typed netCDF + variables. + + >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") + + >>> x.dtype + dtype('int8') + + >>> BoolTypeArray(x).dtype + dtype('bool') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> BoolTypeArray(x)[indexer].dtype + dtype('bool') + """ + + __slots__ = ("array",) + + def __init__(self, array) -> None: + self.array = indexing.as_indexable(array) + + @property + def dtype(self) -> np.dtype: + return np.dtype("bool") + + def __getitem__(self, key) -> np.ndarray: + return np.asarray(self.array[key], dtype=self.dtype) + + def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. Parameters @@ -159,27 +224,29 @@ def encode(self, variable: Variable, name: T_Name = None): fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - if ( - fv is not None - and mv is not None - and not duck_array_ops.allclose_or_equiv(fv, mv) - ): + fv_exists = fv is not None + mv_exists = mv is not None + + if not fv_exists and not mv_exists: + return variable + + if fv_exists and mv_exists and not duck_array_ops.allclose_or_equiv(fv, mv): raise ValueError( f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." 
) - if fv is not None: + if fv_exists: # Ensure _FillValue is cast to same dtype as data's encoding["_FillValue"] = dtype.type(fv) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) if not pd.isnull(fill_value): data = duck_array_ops.fillna(data, fill_value) - if mv is not None: + if mv_exists: # Ensure missing_value is cast to same dtype as data's encoding["missing_value"] = dtype.type(mv) fill_value = pop_to(encoding, attrs, "missing_value", name=name) - if not pd.isnull(fill_value) and fv is None: + if not pd.isnull(fill_value) and not fv_exists: data = duck_array_ops.fillna(data, fill_value) return Variable(dims, data, attrs, encoding, fastpath=True) @@ -224,7 +291,7 @@ def decode(self, variable: Variable, name: T_Name = None): def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike): - data = np.array(data, dtype=dtype, copy=True) + data = data.astype(dtype=dtype, copy=True) if scale_factor is not None: data *= scale_factor if add_offset is not None: @@ -349,3 +416,101 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + + +class DefaultFillvalueCoder(VariableCoder): + """Encode default _FillValue if needed.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_encoding(variable) + # make NaN the fill value for float types + if ( + "_FillValue" not in attrs + and "_FillValue" not in encoding + and np.issubdtype(variable.dtype, np.floating) + ): + attrs["_FillValue"] = variable.dtype.type(np.nan) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + raise NotImplementedError() + + +class BooleanCoder(VariableCoder): + """Code boolean values.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if ( + (variable.dtype == bool) + and ("dtype" not in variable.encoding) + and ("dtype" not in variable.attrs) + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + attrs["dtype"] = "bool" + data = duck_array_ops.astype(data, dtype="i1", copy=True) + + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.attrs.get("dtype", False) == "bool": + dims, data, attrs, encoding = unpack_for_decoding(variable) + # overwrite (!) 
dtype in encoding, and remove from attrs + # needed for correct subsequent encoding + encoding["dtype"] = attrs.pop("dtype") + data = BoolTypeArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class EndianCoder(VariableCoder): + """Decode Endianness to native.""" + + def encode(self): + raise NotImplementedError() + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_decoding(variable) + if not data.dtype.isnative: + data = NativeEndiannessArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class NonStringCoder(VariableCoder): + """Encode NonString variables if dtypes differ.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if "dtype" in variable.encoding and variable.encoding["dtype"] not in ( + "S1", + str, + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + dtype = np.dtype(encoding.pop("dtype")) + if dtype != variable.dtype: + if np.issubdtype(dtype, np.integer): + if ( + np.issubdtype(variable.dtype, np.floating) + and "_FillValue" not in variable.attrs + and "missing_value" not in variable.attrs + ): + warnings.warn( + f"saving variable {name} with floating " + "point data as an integer dtype without " + "any _FillValue to use for NaNs", + SerializationWarning, + stacklevel=10, + ) + data = np.around(data) + data = data.astype(dtype=dtype) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self): + raise NotImplementedError() diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..1506efc31e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -10,7 +10,7 @@ from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to -from xarray.core import duck_array_ops, indexing +from xarray.core import indexing from xarray.core.common import ( _contains_datetime_like_objects, contains_cftime_datetimes, @@ -48,123 +48,10 @@ T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore] -class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from non-native to native endianness - - This is useful for decoding arrays from netCDF3 files (which are all - big endian) into native endianness, so they can be used with Cython - functions, such as those found in bottleneck and pandas. - - >>> x = np.arange(5, dtype=">i2") - - >>> x.dtype - dtype('>i2') - - >>> NativeEndiannessArray(x).dtype - dtype('int16') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> NativeEndiannessArray(x)[indexer].dtype - dtype('int16') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - -class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from integer to boolean datatype - - This is useful for decoding boolean arrays from integer typed netCDF - variables. 
- - >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") - - >>> x.dtype - dtype('int8') - - >>> BoolTypeArray(x).dtype - dtype('bool') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> BoolTypeArray(x)[indexer].dtype - dtype('bool') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype("bool") - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - def _var_as_tuple(var: Variable) -> T_VarTuple: return var.dims, var.data, var.attrs.copy(), var.encoding.copy() -def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable: - if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str): - dims, data, attrs, encoding = _var_as_tuple(var) - dtype = np.dtype(encoding.pop("dtype")) - if dtype != var.dtype: - if np.issubdtype(dtype, np.integer): - if ( - np.issubdtype(var.dtype, np.floating) - and "_FillValue" not in var.attrs - and "missing_value" not in var.attrs - ): - warnings.warn( - f"saving variable {name} with floating " - "point data as an integer dtype without " - "any _FillValue to use for NaNs", - SerializationWarning, - stacklevel=10, - ) - data = np.around(data) - data = data.astype(dtype=dtype) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - -def maybe_default_fill_value(var: Variable) -> Variable: - # make NaN the fill value for float types: - if ( - "_FillValue" not in var.attrs - and "_FillValue" not in var.encoding - and np.issubdtype(var.dtype, np.floating) - ): - var.attrs["_FillValue"] = var.dtype.type(np.nan) - return var - - -def maybe_encode_bools(var: Variable) -> Variable: - if ( - (var.dtype == bool) - and ("dtype" not in var.encoding) - and ("dtype" not in var.attrs) - ): - dims, data, attrs, encoding = _var_as_tuple(var) - attrs["dtype"] = "bool" - data = duck_array_ops.astype(data, dtype="i1", copy=True) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - def _infer_dtype(array, name: T_Name = None) -> np.dtype: """Given an object array with no missing values, infer its dtype from its first element @@ -292,13 +179,13 @@ def encode_cf_variable( variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), variables.UnsignedIntegerCoder(), + variables.NonStringCoder(), + variables.DefaultFillvalueCoder(), + variables.BooleanCoder(), ]: var = coder.encode(var, name=name) - # TODO(shoyer): convert all of these to use coders, too: - var = maybe_encode_nonstring_dtype(var, name=name) - var = maybe_default_fill_value(var) - var = maybe_encode_bools(var) + # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends: var = ensure_dtype_not_object(var, name=name) for attr_name in CF_RELATED_DATA: @@ -389,19 +276,15 @@ def decode_cf_variable( if decode_times: var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) - dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - # TODO(shoyer): convert everything below to use coders + if decode_endianness and not var.dtype.isnative: + var = variables.EndianCoder().decode(var) + original_dtype = var.dtype - if decode_endianness and not data.dtype.isnative: - # do this last, so it's only done if we didn't already unmask/scale - data = NativeEndiannessArray(data) - original_dtype = data.dtype + var = variables.BooleanCoder().decode(var) - encoding.setdefault("dtype", original_dtype) + dimensions, data, attributes, encoding = 
variables.unpack_for_decoding(var) - if "dtype" in attributes and attributes["dtype"] == "bool": - del attributes["dtype"] - data = BoolTypeArray(data) + encoding.setdefault("dtype", original_dtype) if not is_duck_dask_array(data): data = indexing.LazilyIndexedArray(data) diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index 7e6d4ab82d7..b04683b2f5d 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -115,7 +115,7 @@ def _get_date_field(values, name, dtype): access_method, values, name, dtype=dtype, new_axis=new_axis, chunks=chunks ) else: - return access_method(values, name) + return access_method(values, name).astype(dtype) def _round_through_series_or_index(values, name, freq): diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 05e2ca7eb8b..edebccc2534 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -574,6 +574,8 @@ def align(self) -> None: if self.join == "override": self.override_indexes() + elif self.join == "exact" and not self.copy: + self.results = self.objects else: self.reindex_all() diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 946f71e5d28..8106c295f5a 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -369,9 +369,8 @@ def _nested_combine( return combined -# Define type for arbitrarily-nested list of lists recursively -# Currently mypy cannot handle this but other linters can (https://stackoverflow.com/a/53845083/3154101) -DATASET_HYPERCUBE = Union[Dataset, Iterable["DATASET_HYPERCUBE"]] # type: ignore[misc] +# Define type for arbitrarily-nested list of lists recursively: +DATASET_HYPERCUBE = Union[Dataset, Iterable["DATASET_HYPERCUBE"]] def combine_nested( diff --git a/xarray/core/concat.py b/xarray/core/concat.py index f092911948f..dcf2a23d311 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Hashable, Iterable -from typing import TYPE_CHECKING, Any, cast, overload +from typing import TYPE_CHECKING, Any, Union, cast, overload import pandas as pd @@ -27,12 +27,14 @@ JoinOptions, ) + T_DataVars = Union[ConcatOptions, Iterable[Hashable]] + @overload def concat( objs: Iterable[T_Dataset], dim: Hashable | T_DataArray | pd.Index, - data_vars: ConcatOptions | list[Hashable] = "all", + data_vars: T_DataVars = "all", coords: ConcatOptions | list[Hashable] = "different", compat: CompatOptions = "equals", positions: Iterable[Iterable[int]] | None = None, @@ -47,7 +49,7 @@ def concat( def concat( objs: Iterable[T_DataArray], dim: Hashable | T_DataArray | pd.Index, - data_vars: ConcatOptions | list[Hashable] = "all", + data_vars: T_DataVars = "all", coords: ConcatOptions | list[Hashable] = "different", compat: CompatOptions = "equals", positions: Iterable[Iterable[int]] | None = None, @@ -61,7 +63,7 @@ def concat( def concat( objs, dim, - data_vars="all", + data_vars: T_DataVars = "all", coords="different", compat: CompatOptions = "equals", positions=None, @@ -291,7 +293,7 @@ def _calc_concat_dim_index( return dim, index -def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): +def _calc_concat_over(datasets, dim, dim_names, data_vars: T_DataVars, coords, compat): """ Determine which dataset variables need to be concatenated in the result, """ @@ -445,7 +447,7 @@ def _parse_datasets( def _dataset_concat( datasets: list[T_Dataset], dim: str | T_DataArray | pd.Index, - data_vars: str | list[str], + data_vars: T_DataVars, coords: str | list[str], compat: 
CompatOptions, positions: Iterable[Iterable[int]] | None, @@ -665,7 +667,7 @@ def get_indexes(name): def _dataarray_concat( arrays: Iterable[T_DataArray], dim: str | T_DataArray | pd.Index, - data_vars: str | list[str], + data_vars: T_DataVars, coords: str | list[str], compat: CompatOptions, positions: Iterable[Iterable[int]] | None, diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f016a298374..6c8cf50a42e 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2,7 +2,7 @@ import datetime import warnings -from collections.abc import Hashable, Iterable, Mapping, Sequence +from collections.abc import Hashable, Iterable, Mapping, MutableMapping, Sequence from os import PathLike from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn, cast, overload @@ -70,6 +70,7 @@ except ImportError: iris_Cube = None + from xarray.backends import ZarrStore from xarray.backends.api import T_NetcdfEngine, T_NetcdfTypes from xarray.core.groupby import DataArrayGroupBy from xarray.core.resample import DataArrayResample @@ -877,6 +878,12 @@ def encoding(self) -> dict[Any, Any]: def encoding(self, value: Mapping[Any, Any]) -> None: self.variable.encoding = dict(value) + def reset_encoding(self: T_DataArray) -> T_DataArray: + """Return a new DataArray without encoding on the array or any attached + coords.""" + ds = self._to_temp_dataset().reset_encoding() + return self._from_temp_dataset(ds) + @property def indexes(self) -> Indexes: """Mapping of pandas.Index objects used for label based indexing. @@ -3334,9 +3341,9 @@ def interpolate_na( use_coordinate : bool or str, default: True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is + equally-spaced along ``dim``. If True, the IndexVariable `dim` is used. If ``use_coordinate`` is a string, it specifies the name of a - coordinate variariable to use as the index. + coordinate variable to use as the index. limit : int or None, default: None Maximum number of consecutive NaNs to fill. Must be greater than 0 or None for no limit. This filling is done regardless of the size of @@ -3849,7 +3856,7 @@ def to_netcdf( compute: bool = True, invalid_netcdf: bool = False, ) -> bytes | Delayed | None: - """Write dataset contents to a netCDF file. + """Write DataArray contents to a netCDF file. Parameters ---------- @@ -3963,6 +3970,210 @@ def to_netcdf( invalid_netcdf=invalid_netcdf, ) + # compute=True (default) returns ZarrStore + @overload + def to_zarr( + self, + store: MutableMapping | str | PathLike[str] | None = None, + chunk_store: MutableMapping | str | PathLike | None = None, + mode: Literal["w", "w-", "a", "r+", None] = None, + synchronizer=None, + group: str | None = None, + encoding: Mapping | None = None, + compute: Literal[True] = True, + consolidated: bool | None = None, + append_dim: Hashable | None = None, + region: Mapping[str, slice] | None = None, + safe_chunks: bool = True, + storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, + ) -> ZarrStore: + ... 
+ + # compute=False returns dask.Delayed + @overload + def to_zarr( + self, + store: MutableMapping | str | PathLike[str] | None = None, + chunk_store: MutableMapping | str | PathLike | None = None, + mode: Literal["w", "w-", "a", "r+", None] = None, + synchronizer=None, + group: str | None = None, + encoding: Mapping | None = None, + *, + compute: Literal[False], + consolidated: bool | None = None, + append_dim: Hashable | None = None, + region: Mapping[str, slice] | None = None, + safe_chunks: bool = True, + storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, + ) -> Delayed: + ... + + def to_zarr( + self, + store: MutableMapping | str | PathLike[str] | None = None, + chunk_store: MutableMapping | str | PathLike | None = None, + mode: Literal["w", "w-", "a", "r+", None] = None, + synchronizer=None, + group: str | None = None, + encoding: Mapping | None = None, + compute: bool = True, + consolidated: bool | None = None, + append_dim: Hashable | None = None, + region: Mapping[str, slice] | None = None, + safe_chunks: bool = True, + storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, + ) -> ZarrStore | Delayed: + """Write DataArray contents to a Zarr store + + Zarr chunks are determined in the following way: + + - From the ``chunks`` attribute in each variable's ``encoding`` + (can be set via `DataArray.chunk`). + - If the variable is a Dask array, from the dask chunks + - If neither Dask chunks nor encoding chunks are present, chunks will + be determined automatically by Zarr + - If both Dask chunks and encoding chunks are present, encoding chunks + will be used, provided that there is a many-to-one relationship between + encoding chunks and dask chunks (i.e. Dask chunks are bigger than and + evenly divide encoding chunks); otherwise raise a ``ValueError``. + This restriction ensures that no synchronization / locks are required + when writing. To disable this restriction, use ``safe_chunks=False``. + + Parameters + ---------- + store : MutableMapping, str or path-like, optional + Store or path to directory in local or remote file system. + chunk_store : MutableMapping, str or path-like, optional + Store or path to directory in local or remote file system only for Zarr + array chunks. Requires zarr-python v2.4.0 or later. + mode : {"w", "w-", "a", "r+", None}, optional + Persistence mode: "w" means create (overwrite if exists); + "w-" means create (fail if exists); + "a" means override existing variables (create if does not exist); + "r+" means modify existing array *values* only (raise an error if + any metadata or shapes would change). + The default mode is "a" if ``append_dim`` is set. Otherwise, it is + "r+" if ``region`` is set and ``w-`` otherwise. + synchronizer : object, optional + Zarr array synchronizer. + group : str, optional + Group path. (a.k.a. `path` in zarr terminology.) + encoding : dict, optional + Nested dictionary with variable names as keys and dictionaries of + variable specific encodings as values, e.g., + ``{"my_variable": {"dtype": "int16", "scale_factor": 0.1,}, ...}`` + compute : bool, default: True + If True write array data immediately, otherwise return a + ``dask.delayed.Delayed`` object that can be computed to write + array data later. Metadata is always updated eagerly. + consolidated : bool, optional + If True, apply zarr's `consolidate_metadata` function to the store + after writing metadata and read existing stores with consolidated + metadata; if False, do not. 
The default (`consolidated=None`) means + write consolidated metadata and attempt to read consolidated + metadata for existing stores (falling back to non-consolidated). + + When the experimental ``zarr_version=3``, ``consolidated`` must be + either be ``None`` or ``False``. + append_dim : hashable, optional + If set, the dimension along which the data will be appended. All + other dimensions on overridden variables must remain the same size. + region : dict, optional + Optional mapping from dimension names to integer slices along + dataarray dimensions to indicate the region of existing zarr array(s) + in which to write this datarray's data. For example, + ``{'x': slice(0, 1000), 'y': slice(10000, 11000)}`` would indicate + that values should be written to the region ``0:1000`` along ``x`` + and ``10000:11000`` along ``y``. + + Two restrictions apply to the use of ``region``: + + - If ``region`` is set, _all_ variables in a dataarray must have at + least one dimension in common with the region. Other variables + should be written in a separate call to ``to_zarr()``. + - Dimensions cannot be included in both ``region`` and + ``append_dim`` at the same time. To create empty arrays to fill + in with ``region``, use a separate call to ``to_zarr()`` with + ``compute=False``. See "Appending to existing Zarr stores" in + the reference documentation for full details. + safe_chunks : bool, default: True + If True, only allow writes to when there is a many-to-one relationship + between Zarr chunks (specified in encoding) and Dask chunks. + Set False to override this restriction; however, data may become corrupted + if Zarr arrays are written in parallel. This option may be useful in combination + with ``compute=False`` to initialize a Zarr store from an existing + DataArray with arbitrary chunk structure. + storage_options : dict, optional + Any additional parameters for the storage backend (ignored for local + paths). + zarr_version : int or None, optional + The desired zarr spec version to target (currently 2 or 3). The + default of None will attempt to determine the zarr version from + ``store`` when possible, otherwise defaulting to 2. + + Returns + ------- + * ``dask.delayed.Delayed`` if compute is False + * ZarrStore otherwise + + References + ---------- + https://zarr.readthedocs.io/ + + Notes + ----- + Zarr chunking behavior: + If chunks are found in the encoding argument or attribute + corresponding to any DataArray, those chunks are used. + If a DataArray is a dask array, it is written with those chunks. + If not other chunks are found, Zarr uses its own heuristics to + choose automatic chunk sizes. + + encoding: + The encoding attribute (if exists) of the DataArray(s) will be + used. Override any existing encodings by providing the ``encoding`` kwarg. + + See Also + -------- + Dataset.to_zarr + :ref:`io.zarr` + The I/O user guide, with more details and examples. + """ + from xarray.backends.api import DATAARRAY_NAME, DATAARRAY_VARIABLE, to_zarr + + if self.name is None: + # If no name is set then use a generic xarray name + dataset = self.to_dataset(name=DATAARRAY_VARIABLE) + elif self.name in self.coords or self.name in self.dims: + # The name is the same as one of the coords names, which the netCDF data model + # does not support, so rename it but keep track of the old name + dataset = self.to_dataset(name=DATAARRAY_VARIABLE) + dataset.attrs[DATAARRAY_NAME] = self.name + else: + # No problems with the name - so we're fine! 
+ dataset = self.to_dataset() + + return to_zarr( # type: ignore[call-overload,misc] + dataset, + store=store, + chunk_store=chunk_store, + mode=mode, + synchronizer=synchronizer, + group=group, + encoding=encoding, + compute=compute, + consolidated=consolidated, + append_dim=append_dim, + region=region, + safe_chunks=safe_chunks, + storage_options=storage_options, + zarr_version=zarr_version, + ) + def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]: """ Convert this xarray.DataArray into a dictionary following xarray diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3cfb5a4f21f..75aec11b9d3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -666,6 +666,12 @@ def encoding(self) -> dict[Any, Any]: def encoding(self, value: Mapping[Any, Any]) -> None: self._encoding = dict(value) + def reset_encoding(self: T_Dataset) -> T_Dataset: + """Return a new Dataset without encoding on the dataset or any of its + variables/coords.""" + variables = {k: v.reset_encoding() for k, v in self.variables.items()} + return self._replace(variables=variables, encoding={}) + @property def dims(self) -> Frozen[Hashable, int]: """Mapping from dimension names to lengths. @@ -1959,6 +1965,7 @@ def to_zarr( region: Mapping[str, slice] | None = None, safe_chunks: bool = True, storage_options: dict[str, str] | None = None, + zarr_version: int | None = None, ) -> Delayed: ... @@ -2017,7 +2024,7 @@ def to_zarr( Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., ``{"my_variable": {"dtype": "int16", "scale_factor": 0.1,}, ...}`` - compute : bool, optional + compute : bool, default: True If True write array data immediately, otherwise return a ``dask.delayed.Delayed`` object that can be computed to write array data later. Metadata is always updated eagerly. @@ -2051,7 +2058,7 @@ def to_zarr( in with ``region``, use a separate call to ``to_zarr()`` with ``compute=False``. See "Appending to existing Zarr stores" in the reference documentation for full details. - safe_chunks : bool, optional + safe_chunks : bool, default: True If True, only allow writes to when there is a many-to-one relationship between Zarr chunks (specified in encoding) and Dask chunks. Set False to override this restriction; however, data may become corrupted @@ -2095,7 +2102,7 @@ def to_zarr( """ from xarray.backends.api import to_zarr - return to_zarr( # type: ignore + return to_zarr( # type: ignore[call-overload,misc] self, store=store, chunk_store=chunk_store, @@ -5630,9 +5637,9 @@ def interpolate_na( use_coordinate : bool or Hashable, default: True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is + equally-spaced along ``dim``. If True, the IndexVariable `dim` is used. If ``use_coordinate`` is a string, it specifies the name of a - coordinate variariable to use as the index. + coordinate variable to use as the index. limit : int, default: None Maximum number of consecutive NaNs to fill. Must be greater than 0 or None for no limit. 
This filling is done regardless of the size of diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index ed548771809..7f93706c74c 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -16,7 +16,7 @@ from pandas.errors import OutOfBoundsDatetime from xarray.core.duck_array_ops import array_equiv -from xarray.core.indexing import MemoryCachedArray +from xarray.core.indexing import ExplicitlyIndexed, MemoryCachedArray from xarray.core.options import OPTIONS, _get_boolean_with_default from xarray.core.pycompat import array_type from xarray.core.utils import is_duck_array @@ -114,9 +114,9 @@ def calc_max_rows_last(max_rows: int) -> int: def format_timestamp(t): """Cast given object to a Timestamp and return a nicely formatted string""" - # Timestamp is only valid for 1678 to 2262 try: - datetime_str = str(pd.Timestamp(t)) + timestamp = pd.Timestamp(t) + datetime_str = timestamp.isoformat(sep=" ") except OutOfBoundsDatetime: datetime_str = str(t) @@ -557,8 +557,15 @@ def limit_lines(string: str, *, limit: int): return string -def short_numpy_repr(array): - array = np.asarray(array) +def short_array_repr(array): + from xarray.core.common import AbstractArray + + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() + elif isinstance(array, AbstractArray): + array = array.data + if not is_duck_array(array): + array = np.asarray(array) # default to lower precision so a full (abbreviated) line can fit on # one line with the default display_width @@ -582,11 +589,11 @@ def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data if isinstance(array, np.ndarray): - return short_numpy_repr(array) + return short_array_repr(array) elif is_duck_array(internal_data): return limit_lines(repr(array.data), limit=40) elif array._in_memory: - return short_numpy_repr(array) + return short_array_repr(array) else: # internal xarray array type return f"[{array.size} values with dtype={array.dtype}]" @@ -831,7 +838,7 @@ def diff_array_repr(a, b, compat): equiv = array_equiv if not equiv(a.data, b.data): - temp = [wrap_indent(short_numpy_repr(obj), start=" ") for obj in (a, b)] + temp = [wrap_indent(short_array_repr(obj), start=" ") for obj in (a, b)] diff_data_repr = [ ab_side + "\n" + ab_data_repr for ab_side, ab_data_repr in zip(("L", "R"), temp) diff --git a/xarray/core/formatting_html.py b/xarray/core/formatting_html.py index d8d20a9e2c0..60bb901c31a 100644 --- a/xarray/core/formatting_html.py +++ b/xarray/core/formatting_html.py @@ -4,7 +4,7 @@ from collections import OrderedDict from functools import lru_cache, partial from html import escape -from importlib.resources import read_binary +from importlib.resources import files from xarray.core.formatting import ( inline_index_repr, @@ -23,7 +23,7 @@ def _load_static_files(): """Lazily load the resource files into memory the first time they are needed""" return [ - read_binary(package, resource).decode("utf-8") + files(package).joinpath(resource).read_text(encoding="utf-8") for package, resource in STATIC_FILES ] diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index a42141c0efd..2436d540b75 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -435,7 +435,7 @@ def factorize(self, squeeze: bool) -> None: full_index = binned.categories - unique_values = binned.unique().dropna() + unique_values = np.sort(binned.unique().dropna()) group_indices = [g for g in _codes_to_groups(codes, len(full_index)) if g] if 
len(group_indices) == 0: @@ -572,19 +572,33 @@ def _validate_groupby_squeeze(squeeze): def _resolve_group(obj: T_Xarray, group: T_Group | Hashable) -> T_Group: from xarray.core.dataarray import DataArray - if isinstance(group, (DataArray, IndexVariable)): + error_msg = ( + "the group variable's length does not " + "match the length of this variable along its " + "dimensions" + ) + + newgroup: T_Group + if isinstance(group, DataArray): try: align(obj, group, join="exact", copy=False) except ValueError: - raise ValueError( - "the group variable's length does not " - "match the length of this variable along its " - "dimensions" - ) + raise ValueError(error_msg) newgroup = group.copy() newgroup.name = group.name or "group" + elif isinstance(group, IndexVariable): + # This assumption is built in to _ensure_1d. + if group.ndim != 1: + raise ValueError( + "Grouping by multi-dimensional IndexVariables is not allowed." + "Convert to and pass a DataArray instead." + ) + (group_dim,) = group.dims + if len(group) != obj.sizes[group_dim]: + raise ValueError(error_msg) + else: if not hashable(group): raise TypeError( @@ -1069,7 +1083,7 @@ def quantile( * "nearest" See :py:func:`numpy.quantile` or [1]_ for details. Methods marked with - an asterix require numpy version 1.22 or newer. The "method" argument was + an asterisk require numpy version 1.22 or newer. The "method" argument was previously called "interpolation", renamed in accordance with numpy version 1.22.0. keep_attrs : bool or None, default: None diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 5f42c50e26f..93e9e535fe3 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -15,7 +15,13 @@ PandasIndexingAdapter, PandasMultiIndexingAdapter, ) -from xarray.core.utils import Frozen, get_valid_numpy_dtype, is_dict_like, is_scalar +from xarray.core.utils import ( + Frozen, + emit_user_level_warning, + get_valid_numpy_dtype, + is_dict_like, + is_scalar, +) if TYPE_CHECKING: from xarray.core.types import ErrorOptions, T_Index @@ -135,7 +141,7 @@ def _repr_inline_(self, max_width): def _maybe_cast_to_cftimeindex(index: pd.Index) -> pd.Index: from xarray.coding.cftimeindex import CFTimeIndex - if len(index) > 0 and index.dtype == "O": + if len(index) > 0 and index.dtype == "O" and not isinstance(index, CFTimeIndex): try: return CFTimeIndex(index) except (ImportError, TypeError): @@ -166,9 +172,21 @@ def safe_cast_to_index(array: Any) -> pd.Index: elif isinstance(array, PandasIndexingAdapter): index = array.array else: - kwargs = {} - if hasattr(array, "dtype") and array.dtype.kind == "O": - kwargs["dtype"] = object + kwargs: dict[str, str] = {} + if hasattr(array, "dtype"): + if array.dtype.kind == "O": + kwargs["dtype"] = "object" + elif array.dtype == "float16": + emit_user_level_warning( + ( + "`pandas.Index` does not support the `float16` dtype." + " Casting to `float64` for you, but in the future please" + " manually cast to either `float32` and `float64`." 
+ ), + category=DeprecationWarning, + ) + kwargs["dtype"] = "float64" + index = pd.Index(np.asarray(array), **kwargs) return _maybe_cast_to_cftimeindex(index) @@ -259,6 +277,8 @@ def get_indexer_nd(index, labels, method=None, tolerance=None): labels """ flat_labels = np.ravel(labels) + if flat_labels.dtype == "float16": + flat_labels = flat_labels.astype("float64") flat_indexer = index.get_indexer(flat_labels, method=method, tolerance=tolerance) indexer = flat_indexer.reshape(labels.shape) return indexer diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 7109d4fdd2c..35a5261f248 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -449,13 +449,25 @@ class ExplicitlyIndexed: __slots__ = () + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + # Leave casting to an array up to the underlying array type. + return np.asarray(self.get_duck_array(), dtype=dtype) + + def get_duck_array(self): + return self.array + class ExplicitlyIndexedNDArrayMixin(NDArrayMixin, ExplicitlyIndexed): __slots__ = () - def __array__(self, dtype=None): + def get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) - return np.asarray(self[key], dtype=dtype) + return self[key] + + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + # This is necessary because we apply the indexing key in self.get_duck_array() + # Note this is the base class for all lazy indexing classes + return np.asarray(self.get_duck_array(), dtype=dtype) class ImplicitToExplicitIndexingAdapter(NDArrayMixin): @@ -467,8 +479,11 @@ def __init__(self, array, indexer_cls=BasicIndexer): self.array = as_indexable(array) self.indexer_cls = indexer_cls - def __array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + return np.asarray(self.get_duck_array(), dtype=dtype) + + def get_duck_array(self): + return self.array.get_duck_array() def __getitem__(self, key): key = expanded_indexer(key, self.ndim) @@ -531,9 +546,15 @@ def shape(self) -> tuple[int, ...]: shape.append(k.size) return tuple(shape) - def __array__(self, dtype=None): - array = as_indexable(self.array) - return np.asarray(array[self.key], dtype=None) + def get_duck_array(self): + array = self.array[self.key] + # self.array[self.key] is now a numpy array when + # self.array is a BackendArray subclass + # and self.key is BasicIndexer((slice(None, None, None),)) + # so we need the explicit check for ExplicitlyIndexed + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() + return _wrap_numpy_scalars(array) def transpose(self, order): return LazilyVectorizedIndexedArray(self.array, self.key).transpose(order) @@ -584,8 +605,15 @@ def __init__(self, array, key): def shape(self) -> tuple[int, ...]: return np.broadcast(*self.key.tuple).shape - def __array__(self, dtype=None): - return np.asarray(self.array[self.key], dtype=None) + def get_duck_array(self): + array = self.array[self.key] + # self.array[self.key] is now a numpy array when + # self.array is a BackendArray subclass + # and self.key is BasicIndexer((slice(None, None, None),)) + # so we need the explicit check for ExplicitlyIndexed + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() + return _wrap_numpy_scalars(array) def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) @@ -631,8 +659,8 @@ def _ensure_copied(self): self.array = as_indexable(np.array(self.array)) self._copied = True - def 
__array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) + def get_duck_array(self): + return self.array.get_duck_array() def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) @@ -658,12 +686,14 @@ def __init__(self, array): self.array = _wrap_numpy_scalars(as_indexable(array)) def _ensure_cached(self): - if not isinstance(self.array, NumpyIndexingAdapter): - self.array = NumpyIndexingAdapter(np.asarray(self.array)) + self.array = as_indexable(self.array.get_duck_array()) + + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + return np.asarray(self.get_duck_array(), dtype=dtype) - def __array__(self, dtype=None): + def get_duck_array(self): self._ensure_cached() - return np.asarray(self.array, dtype=dtype) + return self.array.get_duck_array() def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) @@ -827,7 +857,7 @@ def explicit_indexing_adapter( result = raw_indexing_method(raw_key.tuple) if numpy_indices.tuple: # index the loaded np.ndarray - result = NumpyIndexingAdapter(np.asarray(result))[numpy_indices] + result = NumpyIndexingAdapter(result)[numpy_indices] return result @@ -1463,6 +1493,9 @@ def __array__(self, dtype: DTypeLike = None) -> np.ndarray: array = array.astype("object") return np.asarray(array.values, dtype=dtype) + def get_duck_array(self) -> np.ndarray: + return np.asarray(self) + @property def shape(self) -> tuple[int, ...]: return (len(self.array),) @@ -1603,9 +1636,9 @@ def _repr_inline_(self, max_width: int) -> str: return format_array_flat(self._get_array_subset(), max_width) def _repr_html_(self) -> str: - from xarray.core.formatting import short_numpy_repr + from xarray.core.formatting import short_array_repr - array_repr = short_numpy_repr(self._get_array_subset()) + array_repr = short_array_repr(self._get_array_subset()) return f"
<pre>{escape(array_repr)}</pre>
" def copy(self, deep: bool = True) -> PandasMultiIndexingAdapter: diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index b20a96bb8d6..c2db154d614 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -39,6 +39,7 @@ from typing import Literal import pandas as pd +from packaging.version import Version from xarray.coding import cftime_offsets @@ -91,3 +92,15 @@ def _convert_base_to_offset(base, freq, index): return base * freq.as_timedelta() // freq.n else: raise ValueError("Can only resample using a DatetimeIndex or CFTimeIndex.") + + +def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp: + """Return a nanosecond-precision Timestamp object. + + Note this function should no longer be needed after addressing GitHub issue + #7493. + """ + if Version(pd.__version__) >= Version("2.0.0"): + return pd.Timestamp(*args, **kwargs).as_unit("ns") + else: + return pd.Timestamp(*args, **kwargs) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 8b9f31bfdfd..7eb4e9c7687 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -376,7 +376,7 @@ def _construct( window_dim = {d: window_dim_kwargs[str(d)] for d in self.dim} window_dims = self._mapping_to_list( - window_dim, allow_default=False, allow_allsame=False # type: ignore[arg-type] # https://github.com/python/mypy/issues/12506 + window_dim, allow_default=False, allow_allsame=False ) strides = self._mapping_to_list(stride, default=1) @@ -753,7 +753,7 @@ def construct( window_dim = {d: window_dim_kwargs[str(d)] for d in self.dim} window_dims = self._mapping_to_list( - window_dim, allow_default=False, allow_allsame=False # type: ignore[arg-type] # https://github.com/python/mypy/issues/12506 + window_dim, allow_default=False, allow_allsame=False ) strides = self._mapping_to_list(stride, default=1) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 08625fe7d95..1c90a2410f2 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -113,7 +113,7 @@ def get_valid_numpy_dtype(array: np.ndarray | pd.Index): dtype = np.dtype("O") elif hasattr(array, "categories"): # category isn't a real numpy dtype - dtype = array.categories.dtype # type: ignore[union-attr] + dtype = array.categories.dtype elif not is_valid_numpy_dtype(array.dtype): dtype = np.dtype("O") else: diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bddeb85f5e9..9fe3c953aa6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -426,6 +426,8 @@ def data(self) -> Any: """ if is_duck_array(self._data): return self._data + elif isinstance(self._data, indexing.ExplicitlyIndexed): + return self._data.get_duck_array() else: return self.values @@ -533,6 +535,8 @@ def load(self, **kwargs): """ if is_duck_dask_array(self._data): self._data = as_compatible_data(self._data.compute(**kwargs)) + elif isinstance(self._data, indexing.ExplicitlyIndexed): + self._data = self._data.get_duck_array() elif not is_duck_array(self._data): self._data = np.asarray(self._data) return self @@ -977,6 +981,10 @@ def encoding(self, value): except ValueError: raise ValueError("encoding must be castable to a dictionary") + def reset_encoding(self: T_Variable) -> T_Variable: + """Return a new Variable without encoding.""" + return self._replace(encoding={}) + def copy( self: T_Variable, deep: bool = True, data: ArrayLike | None = None ) -> T_Variable: diff --git a/xarray/plot/dataset_plot.py b/xarray/plot/dataset_plot.py index 0d9898a6e9a..b0774c31b17 100644 --- a/xarray/plot/dataset_plot.py +++ 
b/xarray/plot/dataset_plot.py @@ -128,7 +128,7 @@ def _dsplot(plotfunc): If ``norm`` has ``vmin`` or ``vmax`` specified, the corresponding kwarg must be ``None``. infer_intervals: bool | None - If True the intervals are infered. + If True the intervals are inferred. center : float, optional The value at which to center the colormap. Passing this value implies use of a diverging colormap. Setting it to ``False`` prevents use of a diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 864b3df8405..7e1b964ecba 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -139,13 +139,18 @@ class UnexpectedDataAccess(Exception): class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): + """Disallows any loading.""" + def __init__(self, array): self.array = array - def __getitem__(self, key): - raise UnexpectedDataAccess("Tried accessing data.") + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") - def __array__(self): + def __getitem__(self, key): raise UnexpectedDataAccess("Tried accessing data.") @@ -157,6 +162,23 @@ def __getitem__(self, key): return self.array[tuple_idxr] +class DuckArrayWrapper(utils.NDArrayMixin): + """Array-like that prevents casting to array. + Modeled after cupy.""" + + def __init__(self, array: np.ndarray): + self.array = array + + def __getitem__(self, key): + return type(self)(self.array[key]) + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __array_namespace__(self): + """Present to satisfy is_duck_array test.""" + + class ReturnItem: def __getitem__(self, key): return key diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index ef91257c4d9..64b487628c8 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -59,6 +59,8 @@ def setup(self): "quarter", "date", "time", + "daysinmonth", + "days_in_month", "is_month_start", "is_month_end", "is_quarter_start", @@ -74,7 +76,18 @@ def test_field_access(self, field) -> None: else: data = getattr(self.times, field) - expected = xr.DataArray(data, name=field, coords=[self.times], dims=["time"]) + if data.dtype.kind != "b" and field not in ("date", "time"): + # pandas 2.0 returns int32 for integer fields now + data = data.astype("int64") + + translations = { + "weekday": "dayofweek", + "daysinmonth": "days_in_month", + "weekofyear": "week", + } + name = translations.get(field, field) + + expected = xr.DataArray(data, name=name, coords=[self.times], dims=["time"]) if field in ["week", "weekofyear"]: with pytest.warns( @@ -84,7 +97,8 @@ def test_field_access(self, field) -> None: else: actual = getattr(self.data.time.dt, field) - assert_equal(expected, actual) + assert expected.dtype == actual.dtype + assert_identical(expected, actual) @pytest.mark.parametrize( "field, pandas_field", diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 12e101a475d..7d58c5bfed2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -630,6 +630,11 @@ def test_roundtrip_boolean_dtype(self) -> None: with self.roundtrip(original) as actual: assert_identical(original, actual) assert actual["x"].dtype == "bool" + # this checks for preserving dtype during second roundtrip + # see https://github.com/pydata/xarray/issues/7652#issuecomment-1476956975 + with self.roundtrip(actual) as actual2: + 
assert_identical(original, actual2) + assert actual2["x"].dtype == "bool" def test_orthogonal_indexing(self) -> None: in_memory = create_test_data() @@ -3237,6 +3242,21 @@ def chunks(request): return request.param +@pytest.fixture(params=["tmp_path", "ZipStore", "Dict"]) +def tmp_store(request, tmp_path): + if request.param == "tmp_path": + return tmp_path + elif request.param == "ZipStore": + from zarr.storage import ZipStore + + path = tmp_path / "store.zip" + return ZipStore(path) + elif request.param == "Dict": + return dict() + else: + raise ValueError("not supported") + + # using pytest.mark.skipif does not work so this a work around def skip_if_not_engine(engine): if engine == "netcdf4": @@ -4591,6 +4611,56 @@ def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: assert_identical(original_da, loaded_da) +@requires_zarr +class TestDataArrayToZarr: + def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: + original_da = DataArray(np.arange(12).reshape((3, 4))) + + original_da.to_zarr(tmp_store) + + with open_dataarray(tmp_store, engine="zarr") as loaded_da: + assert_identical(original_da, loaded_da) + + def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: + original_da = DataArray(np.arange(12).reshape((3, 4)), name="test") + + original_da.to_zarr(tmp_store) + + with open_dataarray(tmp_store, engine="zarr") as loaded_da: + assert_identical(original_da, loaded_da) + + def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: + original_da = DataArray( + np.arange(12).reshape((3, 4)), dims=["x", "y"], name="x" + ) + + original_da.to_zarr(tmp_store) + + with open_dataarray(tmp_store, engine="zarr") as loaded_da: + assert_identical(original_da, loaded_da) + + def test_open_dataarray_options(self, tmp_store) -> None: + data = DataArray(np.arange(5), coords={"y": ("x", range(5))}, dims=["x"]) + + data.to_zarr(tmp_store) + + expected = data.drop_vars("y") + with open_dataarray(tmp_store, engine="zarr", drop_variables=["y"]) as loaded: + assert_identical(expected, loaded) + + @requires_dask + def test_dataarray_to_zarr_compute_false(self, tmp_store) -> None: + from dask.delayed import Delayed + + original_da = DataArray(np.arange(12).reshape((3, 4))) + + output = original_da.to_zarr(tmp_store, compute=False) + assert isinstance(output, Delayed) + output.compute() + with open_dataarray(tmp_store, engine="zarr") as loaded_da: + assert_identical(original_da, loaded_da) + + @requires_scipy_or_netCDF4 def test_no_warning_from_dask_effective_get() -> None: with create_tmp_file() as tmpfile: @@ -5023,7 +5093,7 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.nc") assert entrypoint.guess_can_open("something-local.nc.gz") assert not entrypoint.guess_can_open("not-found-and-no-extension") - assert not entrypoint.guess_can_open(b"not-a-netcdf-file") + assert not entrypoint.guess_can_open(b"not-a-netcdf-file") # type: ignore[arg-type] @requires_h5netcdf diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 6b628c15488..24ffab305ad 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -1373,6 +1373,7 @@ def test_date_range_like_same_calendar(): assert src is out +@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_date_range_like_errors(): src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False) src = src[np.arange(20) != 10] # Remove 1 day so the frequency is not inferable. 
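# --- Illustrative sketch (editor's addition, not part of the diff) ----------------------
# The TestDataArrayToZarr tests above exercise a DataArray round trip through the zarr
# backend. Assuming this changeset's DataArray.to_zarr wrapper and an installed zarr, the
# pattern they verify looks roughly like this; the store path "example.zarr" is hypothetical.
import numpy as np
import xarray as xr

original = xr.DataArray(np.arange(12).reshape((3, 4)))  # unnamed DataArray
original.to_zarr("example.zarr", mode="w")  # write via the zarr backend
with xr.open_dataarray("example.zarr", engine="zarr") as loaded:
    xr.testing.assert_identical(original, loaded)  # round trip preserves values and dims
# -----------------------------------------------------------------------------------------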
diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index f60308f8863..030f653e031 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -297,6 +297,7 @@ def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) +@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_concat_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) @@ -537,8 +538,7 @@ def test_concat_data_vars_typing(self) -> None: actual = concat(objs, dim="x", data_vars="minimal") assert_identical(data, actual) - def test_concat_data_vars(self): - # TODO: annotating this func fails + def test_concat_data_vars(self) -> None: data = Dataset({"foo": ("x", np.random.randn(10))}) objs: list[Dataset] = [data.isel(x=slice(5)), data.isel(x=slice(5, None))] for data_vars in ["minimal", "different", "all", [], ["foo"]]: diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 9485b506b89..acdf9c8846e 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -32,7 +32,7 @@ class TestBoolTypeArray: def test_booltype_array(self) -> None: x = np.array([1, 0, 1, 1, 0], dtype="i1") - bx = conventions.BoolTypeArray(x) + bx = coding.variables.BoolTypeArray(x) assert bx.dtype == bool assert_array_equal(bx, np.array([True, False, True, True, False], dtype=bool)) @@ -41,7 +41,7 @@ class TestNativeEndiannessArray: def test(self) -> None: x = np.arange(5, dtype=">i8") expected = np.arange(5, dtype="int64") - a = conventions.NativeEndiannessArray(x) + a = coding.variables.NativeEndiannessArray(x) assert a.dtype == expected.dtype assert a.dtype == expected[:].dtype assert_array_equal(a, expected) @@ -168,6 +168,7 @@ def test_do_not_overwrite_user_coordinates(self) -> None: with pytest.raises(ValueError, match=r"'coordinates' found in both attrs"): conventions.encode_dataset_coordinates(orig) + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -185,6 +186,7 @@ def test_emit_coordinates_attribute_in_attrs(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_emit_coordinates_attribute_in_encoding(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -247,7 +249,7 @@ def test_decode_coordinates(self) -> None: def test_0d_int32_encoding(self) -> None: original = Variable((), np.int32(0), encoding={"dtype": "int64"}) expected = Variable((), np.int64(0)) - actual = conventions.maybe_encode_nonstring_dtype(original) + actual = coding.variables.NonStringCoder().encode(original) assert_identical(expected, actual) def test_decode_cf_with_multiple_missing_values(self) -> None: diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 21f0ab93d78..b37399d6ef8 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1701,3 +1701,10 @@ def test_graph_manipulation(): # names if we were to use HighLevelGraph.cull() instead of # HighLevelGraph.cull_layers() in Dataset.__dask_postpersist__(). 
assert_equal(ds2.d1 + ds2.d2, ds.d1 + ds.d2) + + +def test_new_index_var_computes_once(): + # regression test for GH1533 + data = dask.array.from_array(np.array([100, 200])) + with raise_if_dask_computes(max_computes=1): + Dataset(coords={"z": ("z", data)}) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 378d471ba6b..dcbfd42c9f1 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -278,6 +278,25 @@ def test_encoding(self) -> None: self.dv.encoding = expected2 assert expected2 is not self.dv.encoding + def test_reset_encoding(self) -> None: + array = self.mda + encoding = {"scale_factor": 10} + array.encoding = encoding + array["x"].encoding = encoding + + assert array.encoding == encoding + assert array["x"].encoding == encoding + + actual = array.reset_encoding() + + # did not modify in place + assert array.encoding == encoding + assert array["x"].encoding == encoding + + # variable and coord encoding is empty + assert actual.encoding == {} + assert actual["x"].encoding == {} + def test_constructor(self) -> None: data = np.random.random((2, 3)) @@ -1004,32 +1023,53 @@ def test_sel_dataarray_datetime_slice(self) -> None: result = array.sel(delta=slice(array.delta[0], array.delta[-1])) assert_equal(result, array) - def test_sel_float(self) -> None: + @pytest.mark.parametrize( + ["coord_values", "indices"], + ( + pytest.param( + np.array([0.0, 0.111, 0.222, 0.333], dtype="float64"), + slice(1, 3), + id="float64", + ), + pytest.param( + np.array([0.0, 0.111, 0.222, 0.333], dtype="float32"), + slice(1, 3), + id="float32", + ), + pytest.param( + np.array([0.0, 0.111, 0.222, 0.333], dtype="float32"), [2], id="scalar" + ), + ), + ) + def test_sel_float(self, coord_values, indices) -> None: data_values = np.arange(4) - # case coords are float32 and label is list of floats - float_values = [0.0, 0.111, 0.222, 0.333] - coord_values = np.asarray(float_values, dtype="float32") - array = DataArray(data_values, [("float32_coord", coord_values)]) - expected = DataArray(data_values[1:3], [("float32_coord", coord_values[1:3])]) - actual = array.sel(float32_coord=float_values[1:3]) - # case coords are float16 and label is list of floats - coord_values_16 = np.asarray(float_values, dtype="float16") - expected_16 = DataArray( - data_values[1:3], [("float16_coord", coord_values_16[1:3])] - ) - array_16 = DataArray(data_values, [("float16_coord", coord_values_16)]) - actual_16 = array_16.sel(float16_coord=float_values[1:3]) + arr = DataArray(data_values, coords={"x": coord_values}, dims="x") - # case coord, label are scalars - expected_scalar = DataArray( - data_values[2], coords={"float32_coord": coord_values[2]} + actual = arr.sel(x=coord_values[indices]) + expected = DataArray( + data_values[indices], coords={"x": coord_values[indices]}, dims="x" ) - actual_scalar = array.sel(float32_coord=float_values[2]) - assert_equal(expected, actual) - assert_equal(expected_scalar, actual_scalar) - assert_equal(expected_16, actual_16) + assert_equal(actual, expected) + + def test_sel_float16(self) -> None: + data_values = np.arange(4) + coord_values = np.array([0.0, 0.111, 0.222, 0.333], dtype="float16") + indices = slice(1, 3) + + message = "`pandas.Index` does not support the `float16` dtype.*" + + with pytest.warns(DeprecationWarning, match=message): + arr = DataArray(data_values, coords={"x": coord_values}, dims="x") + with pytest.warns(DeprecationWarning, match=message): + expected = DataArray( + data_values[indices], coords={"x": 
coord_values[indices]}, dims="x" + ) + + actual = arr.sel(x=coord_values[indices]) + + assert_equal(actual, expected) def test_sel_float_multiindex(self) -> None: # regression test https://github.com/pydata/xarray/issues/5691 diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2e23d02a261..45286727f0a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -36,6 +36,7 @@ from xarray.core.pycompat import array_type, integer_types from xarray.core.utils import is_scalar from xarray.tests import ( + DuckArrayWrapper, InaccessibleArray, UnexpectedDataAccess, assert_allclose, @@ -101,57 +102,63 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: bool_var = np.array([True, False, True], dtype=bool) bool_var_to_append = np.array([False, True], dtype=bool) - ds = xr.Dataset( - data_vars={ - "da": xr.DataArray( - rs.rand(3, 3, nt1), - coords=[lat, lon, time1], - dims=["lat", "lon", "time"], - ), - "string_var": xr.DataArray(string_var, coords=[time1], dims=["time"]), - "string_var_fixed_length": xr.DataArray( - string_var_fixed_length, coords=[time1], dims=["time"] - ), - "unicode_var": xr.DataArray( - unicode_var, coords=[time1], dims=["time"] - ).astype(np.unicode_), - "datetime_var": xr.DataArray(datetime_var, coords=[time1], dims=["time"]), - "bool_var": xr.DataArray(bool_var, coords=[time1], dims=["time"]), - } - ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Converting non-nanosecond") + ds = xr.Dataset( + data_vars={ + "da": xr.DataArray( + rs.rand(3, 3, nt1), + coords=[lat, lon, time1], + dims=["lat", "lon", "time"], + ), + "string_var": xr.DataArray(string_var, coords=[time1], dims=["time"]), + "string_var_fixed_length": xr.DataArray( + string_var_fixed_length, coords=[time1], dims=["time"] + ), + "unicode_var": xr.DataArray( + unicode_var, coords=[time1], dims=["time"] + ).astype(np.unicode_), + "datetime_var": xr.DataArray( + datetime_var, coords=[time1], dims=["time"] + ), + "bool_var": xr.DataArray(bool_var, coords=[time1], dims=["time"]), + } + ) - ds_to_append = xr.Dataset( - data_vars={ - "da": xr.DataArray( - rs.rand(3, 3, nt2), - coords=[lat, lon, time2], - dims=["lat", "lon", "time"], - ), - "string_var": xr.DataArray( - string_var_to_append, coords=[time2], dims=["time"] - ), - "string_var_fixed_length": xr.DataArray( - string_var_fixed_length_to_append, coords=[time2], dims=["time"] - ), - "unicode_var": xr.DataArray( - unicode_var[:nt2], coords=[time2], dims=["time"] - ).astype(np.unicode_), - "datetime_var": xr.DataArray( - datetime_var_to_append, coords=[time2], dims=["time"] - ), - "bool_var": xr.DataArray(bool_var_to_append, coords=[time2], dims=["time"]), - } - ) + ds_to_append = xr.Dataset( + data_vars={ + "da": xr.DataArray( + rs.rand(3, 3, nt2), + coords=[lat, lon, time2], + dims=["lat", "lon", "time"], + ), + "string_var": xr.DataArray( + string_var_to_append, coords=[time2], dims=["time"] + ), + "string_var_fixed_length": xr.DataArray( + string_var_fixed_length_to_append, coords=[time2], dims=["time"] + ), + "unicode_var": xr.DataArray( + unicode_var[:nt2], coords=[time2], dims=["time"] + ).astype(np.unicode_), + "datetime_var": xr.DataArray( + datetime_var_to_append, coords=[time2], dims=["time"] + ), + "bool_var": xr.DataArray( + bool_var_to_append, coords=[time2], dims=["time"] + ), + } + ) - ds_with_new_var = xr.Dataset( - data_vars={ - "new_var": xr.DataArray( - rs.rand(3, 3, nt1 + nt2), - coords=[lat, lon, time1.append(time2)], - dims=["lat", "lon", "time"], 
- ) - } - ) + ds_with_new_var = xr.Dataset( + data_vars={ + "new_var": xr.DataArray( + rs.rand(3, 3, nt1 + nt2), + coords=[lat, lon, time1.append(time2)], + dims=["lat", "lon", "time"], + ) + } + ) assert all(objp.data.flags.writeable for objp in ds.variables.values()) assert all(objp.data.flags.writeable for objp in ds_to_append.variables.values()) @@ -203,6 +210,10 @@ def create_test_stacked_array() -> tuple[DataArray, DataArray]: class InaccessibleVariableDataStore(backends.InMemoryDataStore): + """ + Store that does not allow any data access. + """ + def __init__(self): super().__init__() self._indexvars = set() @@ -223,6 +234,47 @@ def lazy_inaccessible(k, v): return {k: lazy_inaccessible(k, v) for k, v in self._variables.items()} +class DuckBackendArrayWrapper(backends.common.BackendArray): + """Mimic a BackendArray wrapper around DuckArrayWrapper""" + + def __init__(self, array): + self.array = DuckArrayWrapper(array) + self.shape = array.shape + self.dtype = array.dtype + + def get_array(self): + return self.array + + def __getitem__(self, key): + return self.array[key.tuple] + + +class AccessibleAsDuckArrayDataStore(backends.InMemoryDataStore): + """ + Store that returns a duck array, not convertible to numpy array, + on read. Modeled after nVIDIA's kvikio. + """ + + def __init__(self): + super().__init__() + self._indexvars = set() + + def store(self, variables, *args, **kwargs) -> None: + super().store(variables, *args, **kwargs) + for k, v in variables.items(): + if isinstance(v, IndexVariable): + self._indexvars.add(k) + + def get_variables(self) -> dict[Any, xr.Variable]: + def lazy_accessible(k, v) -> xr.Variable: + if k in self._indexvars: + return v + data = indexing.LazilyIndexedArray(DuckBackendArrayWrapper(v.values)) + return Variable(v.dims, data, v.attrs) + + return {k: lazy_accessible(k, v) for k, v in self._variables.items()} + + class TestDataset: def test_repr(self) -> None: data = create_test_data(seed=123) @@ -443,6 +495,7 @@ def test_constructor_1d(self) -> None: actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: @@ -2827,6 +2880,21 @@ def test_copy_with_data_errors(self) -> None: with pytest.raises(ValueError, match=r"contain all variables in original"): orig.copy(data={"var1": new_var1}) + def test_reset_encoding(self) -> None: + orig = create_test_data() + vencoding = {"scale_factor": 10} + orig.encoding = {"foo": "bar"} + + for k, v in orig.variables.items(): + orig[k].encoding = vencoding + + actual = orig.reset_encoding() + assert actual.encoding == {} + for k, v in actual.variables.items(): + assert v.encoding == {} + + assert_equal(actual, orig) + def test_rename(self) -> None: data = create_test_data() newnames = { @@ -4684,6 +4752,29 @@ def test_lazy_load(self) -> None: ds.isel(time=10) ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) + def test_lazy_load_duck_array(self) -> None: + store = AccessibleAsDuckArrayDataStore() + create_test_data().dump_to_store(store) + + for decode_cf in [True, False]: + ds = open_dataset(store, decode_cf=decode_cf) + with pytest.raises(UnexpectedDataAccess): + ds["var1"].values + + # these should not raise UnexpectedDataAccess: + ds.var1.data + ds.isel(time=10) + ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) + repr(ds) + + # preserve the duck array type and don't cast to array + assert 
isinstance(ds["var1"].load().data, DuckArrayWrapper) + assert isinstance( + ds["var1"].isel(dim2=0, dim1=0).load().data, DuckArrayWrapper + ) + + ds.close() + def test_dropna(self) -> None: x = np.random.randn(4, 4) x[::2, 0] = np.nan @@ -5495,6 +5586,7 @@ def test_dataset_math_auto_align(self) -> None: expected = ds + other.reindex_like(ds) assert_identical(expected, actual) + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_dataset_math_errors(self) -> None: ds = self.make_example_math_dataset() @@ -6494,6 +6586,7 @@ def test_differentiate(dask, edge_order) -> None: da.differentiate("x2d") +@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: rs = np.random.RandomState(42) @@ -6690,6 +6783,7 @@ def test_cumulative_integrate(dask) -> None: da.cumulative_integrate("x2d") +@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapz_datetime(dask, which_datetime) -> None: diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 3cba5b965f9..bf5f7d0bdc5 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -542,7 +542,7 @@ def test_set_numpy_options() -> None: assert np.get_printoptions() == original_options -def test_short_numpy_repr() -> None: +def test_short_array_repr() -> None: cases = [ np.random.randn(500), np.random.randn(20, 20), @@ -552,16 +552,16 @@ def test_short_numpy_repr() -> None: ] # number of lines: # for default numpy repr: 167, 140, 254, 248, 599 - # for short_numpy_repr: 1, 7, 24, 19, 25 + # for short_array_repr: 1, 7, 24, 19, 25 for array in cases: - num_lines = formatting.short_numpy_repr(array).count("\n") + 1 + num_lines = formatting.short_array_repr(array).count("\n") + 1 assert num_lines < 30 # threshold option (default: 200) array2 = np.arange(100) - assert "..." not in formatting.short_numpy_repr(array2) + assert "..." not in formatting.short_array_repr(array2) with xr.set_options(display_values_threshold=10): - assert "..." in formatting.short_numpy_repr(array2) + assert "..." 
in formatting.short_array_repr(array2) def test_large_array_repr_length() -> None: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 73a5b6494a3..ca9e0f40cc3 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -501,6 +501,7 @@ def test_groupby_repr_datetime(obj) -> None: assert actual == expected +@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning") def test_groupby_drops_nans() -> None: # GH2383 @@ -1370,36 +1371,51 @@ def test_groupby_multidim_map(self): ) assert_identical(expected, actual) - def test_groupby_bins(self): - array = DataArray(np.arange(4), dims="dim_0") + @pytest.mark.parametrize("use_flox", [True, False]) + @pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]]) + def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None: + array = DataArray( + np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a" + ) # the first value should not be part of any group ("right" binning) array[0] = 99 # bins follow conventions for pandas.cut # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html bins = [0, 1.5, 5] - bin_coords = pd.cut(array["dim_0"], bins).categories - expected = DataArray( - [1, 5], dims="dim_0_bins", coords={"dim_0_bins": bin_coords} + + df = array.to_dataframe() + df["dim_0_bins"] = pd.cut(array["dim_0"], bins) + + expected_df = df.groupby("dim_0_bins").sum() + # TODO: can't convert df with IntervalIndex to Xarray + + expected = ( + expected_df.reset_index(drop=True) + .to_xarray() + .assign_coords(index=np.array(expected_df.index)) + .rename({"index": "dim_0_bins"})["a"] ) - actual = array.groupby_bins("dim_0", bins=bins).sum() - assert_identical(expected, actual) - actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum() - assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual) + with xr.set_options(use_flox=use_flox): + actual = array.groupby_bins("dim_0", bins=bins).sum() + assert_identical(expected, actual) - actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum()) - assert_identical(expected, actual) + actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum() + assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual) - # make sure original array dims are unchanged - assert len(array.dim_0) == 4 + actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum()) + assert_identical(expected, actual) - da = xr.DataArray(np.ones((2, 3, 4))) - bins = [-1, 0, 1, 2] - with xr.set_options(use_flox=False): - actual = da.groupby_bins("dim_0", bins).mean(...) - with xr.set_options(use_flox=True): - expected = da.groupby_bins("dim_0", bins).mean(...) - assert_allclose(actual, expected) + # make sure original array dims are unchanged + assert len(array.dim_0) == 4 + + da = xr.DataArray(np.ones((2, 3, 4))) + bins = [-1, 0, 1, 2] + with xr.set_options(use_flox=False): + actual = da.groupby_bins("dim_0", bins).mean(...) + with xr.set_options(use_flox=True): + expected = da.groupby_bins("dim_0", bins).mean(...) 
+ assert_allclose(actual, expected) def test_groupby_bins_empty(self): array = DataArray(np.arange(4), [("x", range(4))]) @@ -1805,6 +1821,7 @@ def test_upsample_interpolate(self): assert_allclose(expected, actual, rtol=1e-16) @requires_scipy + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_upsample_interpolate_bug_2197(self): dates = pd.date_range("2007-02-01", "2007-03-01", freq="D") da = xr.DataArray(np.arange(len(dates)), [("time", dates)]) diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index e66045e978d..026edf96b62 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -605,6 +605,7 @@ def test_interp_like() -> None: pytest.param("2000-01-01T12:00", 0.5, marks=pytest.mark.xfail), ], ) +@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime(x_new, expected) -> None: da = xr.DataArray( np.arange(24), diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 02f7f4b9be2..18ca49670ba 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2042,7 +2042,7 @@ def test_seaborn_palette_as_cmap(self) -> None: def test_convenient_facetgrid(self) -> None: a = easy_array((10, 15, 4)) d = DataArray(a, dims=["y", "x", "z"]) - g = self.plotfunc(d, x="x", y="y", col="z", col_wrap=2) + g = self.plotfunc(d, x="x", y="y", col="z", col_wrap=2) # type: ignore[arg-type] # https://github.com/python/mypy/issues/15015 assert_array_equal(g.axs.shape, [2, 2]) for (y, x), ax in np.ndenumerate(g.axs): @@ -2051,7 +2051,7 @@ def test_convenient_facetgrid(self) -> None: assert "x" == ax.get_xlabel() # Inferring labels - g = self.plotfunc(d, col="z", col_wrap=2) + g = self.plotfunc(d, col="z", col_wrap=2) # type: ignore[arg-type] # https://github.com/python/mypy/issues/15015 assert_array_equal(g.axs.shape, [2, 2]) for (y, x), ax in np.ndenumerate(g.axs): assert ax.has_data() @@ -2811,6 +2811,7 @@ def test_datetime_plot1d(self) -> None: # mpl.dates.AutoDateLocator passes and no other subclasses: assert type(ax.xaxis.get_major_locator()) is mpl.dates.AutoDateLocator + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime_plot2d(self) -> None: # Test that matplotlib-native datetime works: da = DataArray( diff --git a/xarray/tests/test_plugins.py b/xarray/tests/test_plugins.py index 421be1df2dc..0882bc1b570 100644 --- a/xarray/tests/test_plugins.py +++ b/xarray/tests/test_plugins.py @@ -2,11 +2,27 @@ import sys from importlib.metadata import EntryPoint + +if sys.version_info >= (3, 10): + from importlib.metadata import EntryPoints +else: + EntryPoints = list[EntryPoint] from unittest import mock import pytest from xarray.backends import common, plugins +from xarray.tests import ( + has_h5netcdf, + has_netCDF4, + has_pseudonetcdf, + has_pydap, + has_pynio, + has_scipy, + has_zarr, +) + +# Do not import list_engines here, this will break the lazy tests importlib_metadata_mock = "importlib.metadata" @@ -57,7 +73,7 @@ def test_broken_plugin() -> None: "xarray.backends", ) with pytest.warns(RuntimeWarning) as record: - _ = plugins.build_engines([broken_backend]) + _ = plugins.build_engines(EntryPoints([broken_backend])) assert len(record) == 1 message = str(record[0].message) assert "Engine 'broken_backend'" in message @@ -99,23 +115,29 @@ def test_set_missing_parameters() -> None: assert backend_1.open_dataset_parameters == ("filename_or_obj", "decoder") assert backend_2.open_dataset_parameters == ("filename_or_obj",) - backend = DummyBackendEntrypointKwargs() - 
backend.open_dataset_parameters = ("filename_or_obj", "decoder") # type: ignore[misc] - plugins.set_missing_parameters({"engine": backend}) - assert backend.open_dataset_parameters == ("filename_or_obj", "decoder") + backend_kwargs = DummyBackendEntrypointKwargs + backend_kwargs.open_dataset_parameters = ("filename_or_obj", "decoder") + plugins.set_missing_parameters({"engine": backend_kwargs}) + assert backend_kwargs.open_dataset_parameters == ("filename_or_obj", "decoder") - backend_args = DummyBackendEntrypointArgs() - backend_args.open_dataset_parameters = ("filename_or_obj", "decoder") # type: ignore[misc] + backend_args = DummyBackendEntrypointArgs + backend_args.open_dataset_parameters = ("filename_or_obj", "decoder") plugins.set_missing_parameters({"engine": backend_args}) assert backend_args.open_dataset_parameters == ("filename_or_obj", "decoder") + # reset + backend_1.open_dataset_parameters = None + backend_1.open_dataset_parameters = None + backend_kwargs.open_dataset_parameters = None + backend_args.open_dataset_parameters = None + def test_set_missing_parameters_raise_error() -> None: - backend = DummyBackendEntrypointKwargs() + backend = DummyBackendEntrypointKwargs with pytest.raises(TypeError): plugins.set_missing_parameters({"engine": backend}) - backend_args = DummyBackendEntrypointArgs() + backend_args = DummyBackendEntrypointArgs with pytest.raises(TypeError): plugins.set_missing_parameters({"engine": backend_args}) @@ -128,7 +150,7 @@ def test_build_engines() -> None: dummy_pkg_entrypoint = EntryPoint( "dummy", "xarray.tests.test_plugins:backend_1", "xarray_backends" ) - backend_entrypoints = plugins.build_engines([dummy_pkg_entrypoint]) + backend_entrypoints = plugins.build_engines(EntryPoints([dummy_pkg_entrypoint])) assert isinstance(backend_entrypoints["dummy"], DummyBackendEntrypoint1) assert backend_entrypoints["dummy"].open_dataset_parameters == ( @@ -142,10 +164,16 @@ def test_build_engines() -> None: mock.MagicMock(return_value=DummyBackendEntrypoint1), ) def test_build_engines_sorted() -> None: - dummy_pkg_entrypoints = [ - EntryPoint("dummy2", "xarray.tests.test_plugins:backend_1", "xarray.backends"), - EntryPoint("dummy1", "xarray.tests.test_plugins:backend_1", "xarray.backends"), - ] + dummy_pkg_entrypoints = EntryPoints( + [ + EntryPoint( + "dummy2", "xarray.tests.test_plugins:backend_1", "xarray.backends" + ), + EntryPoint( + "dummy1", "xarray.tests.test_plugins:backend_1", "xarray.backends" + ), + ] + ) backend_entrypoints = list(plugins.build_engines(dummy_pkg_entrypoints)) indices = [] @@ -239,3 +267,57 @@ def test_lazy_import() -> None: finally: # restore original sys.modules.update(modules_backup) + + +def test_list_engines() -> None: + from xarray.backends import list_engines + + engines = list_engines() + assert list_engines.cache_info().currsize == 1 + + assert ("scipy" in engines) == has_scipy + assert ("h5netcdf" in engines) == has_h5netcdf + assert ("netcdf4" in engines) == has_netCDF4 + assert ("pseudonetcdf" in engines) == has_pseudonetcdf + assert ("pydap" in engines) == has_pydap + assert ("zarr" in engines) == has_zarr + assert ("pynio" in engines) == has_pynio + assert "store" in engines + + +def test_refresh_engines() -> None: + from xarray.backends import list_engines, refresh_engines + + EntryPointMock1 = mock.MagicMock() + EntryPointMock1.name = "test1" + EntryPointMock1.load.return_value = DummyBackendEntrypoint1 + + if sys.version_info >= (3, 10): + return_value = EntryPoints([EntryPointMock1]) + else: + return_value = 
{"xarray.backends": [EntryPointMock1]} + + with mock.patch("xarray.backends.plugins.entry_points", return_value=return_value): + list_engines.cache_clear() + engines = list_engines() + assert "test1" in engines + assert isinstance(engines["test1"], DummyBackendEntrypoint1) + + EntryPointMock2 = mock.MagicMock() + EntryPointMock2.name = "test2" + EntryPointMock2.load.return_value = DummyBackendEntrypoint2 + + if sys.version_info >= (3, 10): + return_value2 = EntryPoints([EntryPointMock2]) + else: + return_value2 = {"xarray.backends": [EntryPointMock2]} + + with mock.patch("xarray.backends.plugins.entry_points", return_value=return_value2): + refresh_engines() + engines = list_engines() + assert "test1" not in engines + assert "test2" in engines + assert isinstance(engines["test2"], DummyBackendEntrypoint2) + + # reset to original + refresh_engines() diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 0f6f353faf2..36f62fad71f 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -23,11 +23,13 @@ def new_method(): @pytest.mark.parametrize( - "a, b, expected", [["a", "b", np.array(["a", "b"])], [1, 2, pd.Index([1, 2])]] + ["a", "b", "expected"], + [ + [np.array(["a"]), np.array(["b"]), np.array(["a", "b"])], + [np.array([1], dtype="int64"), np.array([2], dtype="int64"), pd.Index([1, 2])], + ], ) def test_maybe_coerce_to_str(a, b, expected): - a = np.array([a]) - b = np.array([b]) index = pd.Index(a).append(pd.Index(b)) actual = utils.maybe_coerce_to_str(index, [a, b]) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f656818c71f..b92db16e34b 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1,6 +1,7 @@ from __future__ import annotations import warnings +from abc import ABC, abstractmethod from copy import copy, deepcopy from datetime import datetime, timedelta from textwrap import dedent @@ -61,8 +62,10 @@ def var(): return Variable(dims=list("xyz"), data=np.random.rand(3, 4, 5)) -class VariableSubclassobjects: - cls: staticmethod[Variable] +class VariableSubclassobjects(ABC): + @abstractmethod + def cls(self, *args, **kwargs) -> Variable: + raise NotImplementedError def test_properties(self): data = 0.5 * np.arange(10) @@ -204,6 +207,7 @@ def test_index_0d_string(self): x = self.cls(["x"], [value]) self._assertIndexedLikeNDArray(x, value, dtype) + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_index_0d_datetime(self): d = datetime(2000, 1, 1) x = self.cls(["x"], [d]) @@ -215,6 +219,7 @@ def test_index_0d_datetime(self): x = self.cls(["x"], pd.DatetimeIndex([d])) self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]") + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_index_0d_timedelta64(self): td = timedelta(hours=1) @@ -275,6 +280,7 @@ def test_0d_time_data(self): expected = np.datetime64("2000-01-01", "ns") assert x[0].values == expected + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime64_conversion(self): times = pd.date_range("2000-01-01", periods=3) for values, preserve_source in [ @@ -290,6 +296,7 @@ def test_datetime64_conversion(self): same_source = source_ndarray(v.values) is source_ndarray(values) assert preserve_source == same_source + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_timedelta64_conversion(self): times = pd.timedelta_range(start=0, periods=3) for values, preserve_source in [ @@ -310,6 +317,7 @@ def test_object_conversion(self): actual = 
self.cls("x", data) assert actual.dtype == data.dtype + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime64_valid_range(self): data = np.datetime64("1250-01-01", "us") pderror = pd.errors.OutOfBoundsDatetime @@ -317,6 +325,7 @@ def test_datetime64_valid_range(self): self.cls(["t"], [data]) @pytest.mark.xfail(reason="pandas issue 36615") + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_timedelta64_valid_range(self): data = np.timedelta64("200000", "D") pderror = pd.errors.OutOfBoundsTimedelta @@ -455,6 +464,23 @@ def test_encoding_preserved(self): assert_identical(expected.to_base_variable(), actual.to_base_variable()) assert expected.encoding == actual.encoding + def test_reset_encoding(self) -> None: + encoding1 = {"scale_factor": 1} + # encoding set via cls constructor + v1 = self.cls(["a"], [0, 1, 2], encoding=encoding1) + assert v1.encoding == encoding1 + v2 = v1.reset_encoding() + assert v1.encoding == encoding1 + assert v2.encoding == {} + + # encoding set via setter + encoding3 = {"scale_factor": 10} + v3 = self.cls(["a"], [0, 1, 2], encoding=encoding3) + assert v3.encoding == encoding3 + v4 = v3.reset_encoding() + assert v3.encoding == encoding3 + assert v4.encoding == {} + def test_concat(self): x = np.arange(5) y = np.arange(5, 10) @@ -1033,7 +1059,8 @@ def test_rolling_window_errors(self, dim, window, window_dim, center): class TestVariable(VariableSubclassobjects): - cls = staticmethod(Variable) + def cls(self, *args, **kwargs) -> Variable: + return Variable(*args, **kwargs) @pytest.fixture(autouse=True) def setup(self): @@ -1062,6 +1089,7 @@ def test_numpy_same_methods(self): v = IndexVariable("x", np.arange(5)) assert 2 == v.searchsorted(2) + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime64_conversion_scalar(self): expected = np.datetime64("2000-01-01", "ns") for values in [ @@ -1074,6 +1102,7 @@ def test_datetime64_conversion_scalar(self): assert v.values == expected assert v.values.dtype == np.dtype("datetime64[ns]") + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_timedelta64_conversion_scalar(self): expected = np.timedelta64(24 * 60 * 60 * 10**9, "ns") for values in [ @@ -1100,6 +1129,7 @@ def test_0d_datetime(self): assert v.dtype == np.dtype("datetime64[ns]") assert v.values == np.datetime64("2000-01-01", "ns") + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_0d_timedelta(self): for td in [pd.to_timedelta("1s"), np.timedelta64(1, "s")]: v = Variable([], td) @@ -1538,6 +1568,7 @@ def test_transpose(self): v.transpose(..., "not_a_dim", missing_dims="warn") assert_identical(expected_ell, actual) + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_transpose_0d(self): for value in [ 3.5, @@ -2203,7 +2234,8 @@ def test_coarsen_keep_attrs(self, operation="mean"): @requires_dask class TestVariableWithDask(VariableSubclassobjects): - cls = staticmethod(lambda *args: Variable(*args).chunk()) + def cls(self, *args, **kwargs) -> Variable: + return Variable(*args, **kwargs).chunk() def test_chunk(self): unblocked = Variable(["dim_0", "dim_1"], np.ones((3, 4))) @@ -2315,7 +2347,8 @@ def test_as_sparse(self): class TestIndexVariable(VariableSubclassobjects): - cls = staticmethod(IndexVariable) + def cls(self, *args, **kwargs) -> IndexVariable: + return IndexVariable(*args, **kwargs) def test_init(self): with pytest.raises(ValueError, match=r"must be 1-dimensional"): @@ -2519,6 +2552,7 @@ def test_masked_array(self): 
assert_array_equal(expected, actual) assert np.dtype(float) == actual.dtype + @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_datetime(self): expected = np.datetime64("2000-01-01") actual = as_compatible_data(expected)
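# --- Illustrative sketch (editor's addition, not part of the diff) ----------------------
# The many new "ignore:Converting non-nanosecond" filters above exist because, with
# pandas >= 2.0, datetime64/timedelta64 input can carry a non-nanosecond unit; under this
# changeset xarray still coerces such data to nanosecond precision and warns about it.
# A minimal sketch of the behaviour being silenced, assuming pandas >= 2.0:
import numpy as np
import xarray as xr

times = np.array(["2000-01-01", "2000-01-02"], dtype="datetime64[s]")  # second precision
var = xr.Variable(["time"], times)  # may emit a "Converting non-nanosecond ..." warning
assert var.dtype == np.dtype("datetime64[ns]")  # values are coerced to nanosecond precision
# -----------------------------------------------------------------------------------------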