diff --git a/CHANGELOG.md b/CHANGELOG.md index c5cfa5a9e..e2161a28b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,9 @@ - Fix bug where namespaces were loaded in "w-" mode. @h-mayorquin [#1795](https://github.com/NeurodataWithoutBorders/pynwb/pull/1795) - Fix bug where pynwb version was reported as "unknown" to readthedocs @stephprince [#1810](https://github.com/NeurodataWithoutBorders/pynwb/pull/1810) +### Documentation and tutorial enhancements +- Add RemFile to streaming tutorial @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) + ## PyNWB 2.5.0 (August 18, 2023) ### Enhancements and minor changes diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index fb34b498b..101400c2d 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -90,6 +90,9 @@ # `fsspec documentation on known implementations `_ # for a full updated list of supported store formats. # +# One downside of this fsspec method is that fsspec is not optimized for reading HDF5 files, and so streaming data +# using this method can be slow. A faster alternative is ``remfile``, described below. +# # Streaming Method 2: ROS3 # ------------------------ # ROS3 stands for "read only S3" and is a driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored @@ -120,15 +123,43 @@ # # pip uninstall h5py # conda install -c conda-forge "h5py>=3.2" +# +# Besides the extra burden of installing h5py from a non-PyPI source, another downside of the ROS3 method is that +# it does not support automatic retries if the connection fails. + + +################################################## +# Streaming Method 3: remfile +# --------------------------- +# ``remfile`` is another library that enables indexing and streaming of files in S3. ``remfile`` is simple, fast, and +# allows for caching of data in the local filesystem. 
The caveats of ``remfile`` are that it is a very new project +# that has not yet been tested in a wide variety of use cases, and that its caching options are limited compared to ``fsspec``. +# You can install ``remfile`` with pip: +# +# .. code-block:: bash +# +# pip install remfile +# + +import h5py +from pynwb import NWBHDF5IO +import remfile + +rem_file = remfile.File(s3_url) +with h5py.File(rem_file, "r") as h5py_file: + with NWBHDF5IO(file=h5py_file, load_namespaces=True) as io: + nwbfile = io.read() + print(nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:]) ################################################## # Which streaming method to choose? # --------------------------------- # # From a user perspective, once opened, the :py:class:`~pynwb.file.NWBFile` works the same with -# both fsspec and ros3. However, in general, we currently recommend using fsspec for streaming -# NWB files because it is more performant and reliable than ros3. In particular fsspec: +# fsspec, ros3, or remfile. However, in general, we currently recommend using fsspec for streaming +# NWB files because it is more performant and reliable than ros3 and more widely tested than remfile. +# In particular, fsspec: # # 1. supports caching, which will dramatically speed up repeated requests for the # same region of data, diff --git a/environment-ros3.yml b/environment-ros3.yml index 2bf2678d2..c84b4c090 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -16,3 +16,6 @@ dependencies: - fsspec==2023.6.0 - requests==2.28.1 - aiohttp==3.8.3 + - pip + - pip: + - remfile==0.1.9