From 077276a800cdadebffd6768a9a4ba7d0e450a003 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 20 Nov 2024 17:53:51 -0600 Subject: [PATCH] Add utility for opening remote files with ``fsspec`` (#9797) * Add utility for opening remote files with fsspec * Apply Joe's suggestions from code review Co-authored-by: Joe Hamman * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Lint * Add what's new entry * Type hint * Make mypy happy --------- Co-authored-by: Joe Hamman Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/whats-new.rst | 3 +++ xarray/backends/common.py | 9 +++++++++ xarray/backends/h5netcdf_.py | 12 +++++++++++- xarray/tests/test_backends.py | 21 +++++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6bfb1238935..3a04467d483 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -57,6 +57,9 @@ New Features - Optimize :py:meth:`DataArray.polyfit` and :py:meth:`Dataset.polyfit` with dask, when used with arrays with more than two dimensions. (:issue:`5629`). By `Deepak Cherian `_. +- Support for directly opening remote files as string paths (for example, ``s3://bucket/data.nc``) + with ``fsspec`` when using the ``h5netcdf`` engine (:issue:`9723`, :pull:`9797`). + By `James Bourbeau `_. - Re-implement the :py:mod:`ufuncs` module, which now dynamically dispatches to the underlying array's backend. Provides better support for certain wrapped array types like ``jax.numpy.ndarray``. (:issue:`7848`, :pull:`9776`). diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 8f2c9e8b1aa..3756de90b60 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -181,6 +181,15 @@ def _normalize_path_list( return _normalize_path_list(paths) +def _open_remote_file(file, mode, storage_options=None): + import fsspec + + fs, _, paths = fsspec.get_fs_token_paths( + file, mode=mode, storage_options=storage_options + ) + return fs.open(paths[0], mode=mode) + + def _encode_variable_name(name): if name is None: name = NONE_VAR_NAME diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 95cc1a1e93d..717ee48db3b 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -13,6 +13,7 @@ BackendEntrypoint, WritableCFDataStore, _normalize_path, + _open_remote_file, datatree_from_dict_with_io_cleanup, find_root_and_group, ) @@ -149,9 +150,16 @@ def open( decode_vlen_strings=True, driver=None, driver_kwds=None, + storage_options: dict[str, Any] | None = None, ): import h5netcdf + if isinstance(filename, str) and is_remote_uri(filename) and driver is None: + mode_ = "rb" if mode == "r" else mode + filename = _open_remote_file( + filename, mode=mode_, storage_options=storage_options + ) + if isinstance(filename, bytes): raise ValueError( "can't open netCDF4/HDF5 as bytes " @@ -161,7 +169,7 @@ def open( magic_number = read_magic_number_from_file(filename) if not magic_number.startswith(b"\211HDF\r\n\032\n"): raise ValueError( - f"{magic_number} is not the signature of a valid netCDF4 file" + f"{magic_number!r} is not the signature of a valid netCDF4 file" ) if format not in [None, "NETCDF4"]: @@ -425,6 +433,7 @@ def open_dataset( decode_vlen_strings=True, driver=None, driver_kwds=None, + storage_options: dict[str, Any] | None = None, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) store = H5NetCDFStore.open( @@ -437,6 +446,7 @@ def open_dataset( decode_vlen_strings=decode_vlen_strings, driver=driver, driver_kwds=driver_kwds, + storage_options=storage_options, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fd866cae5ee..9e150507372 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6489,3 +6489,24 @@ def test_zarr_safe_chunk_region(tmp_path): chunk = ds.isel(region) chunk = chunk.chunk() chunk.chunk().to_zarr(store, region=region) + + +@requires_h5netcdf +@requires_fsspec +def test_h5netcdf_storage_options() -> None: + with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2): + ds1 = create_test_data() + ds1.to_netcdf(f1, engine="h5netcdf") + + ds2 = create_test_data() + ds2.to_netcdf(f2, engine="h5netcdf") + + files = [f"file://{f}" for f in [f1, f2]] + ds = xr.open_mfdataset( + files, + engine="h5netcdf", + concat_dim="time", + combine="nested", + storage_options={"skip_instance_cache": False}, + ) + assert_identical(xr.concat([ds1, ds2], dim="time"), ds)