diff --git a/CHANGELOG.md b/CHANGELOG.md
index d3f014d36..acf3d174e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,8 @@
   [#1591](https://github.com/NeurodataWithoutBorders/pynwb/pull/1591)
 - Updated citation for PyNWB in docs and duecredit to use the eLife NWB paper. @oruebel
   [#1604](https://github.com/NeurodataWithoutBorders/pynwb/pull/1604)
 - Fixed docs build warnings due to use of hardcoded links. @oruebel [#1604](https://github.com/NeurodataWithoutBorders/pynwb/pull/1604)
+- Updated the [iterative write tutorial](https://pynwb.readthedocs.io/en/stable/tutorials/advanced_io/iterative_write.html) to reference the new ``GenericDataChunkIterator`` functionality and use the new ``H5DataIO.dataset`` property to simplify the custom I/O section. @oruebel [#1633](https://github.com/NeurodataWithoutBorders/pynwb/pull/1633)
+- Updated the [parallel I/O tutorial](https://pynwb.readthedocs.io/en/stable/tutorials/advanced_io/parallelio.html) to use the new ``H5DataIO.dataset`` feature to set up an empty dataset for parallel write. @oruebel [#1633](https://github.com/NeurodataWithoutBorders/pynwb/pull/1633)
 ### Bug fixes
 - Added shape constraint to `PatchClampSeries.data`. @bendichter
diff --git a/docs/Makefile b/docs/Makefile
index 4cb0b9d41..80492bbf2 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -9,6 +9,7 @@ PAPER =
 BUILDDIR      = _build
 SRCDIR        = ../src
 RSTDIR        = source
+GALLERYDIR    = gallery
 PKGNAME       = pynwb
 
 # Internal variables.
@@ -45,7 +46,7 @@ help:
 	@echo "  apidoc     to build RST from source code"
 
 clean:
-	-rm -rf $(BUILDDIR)/* $(RSTDIR)/$(PKGNAME)*.rst $(RSTDIR)/tutorials
+	-rm -rf $(BUILDDIR)/* $(RSTDIR)/$(PKGNAME)*.rst $(RSTDIR)/tutorials $(GALLERYDIR)/advanced_io/*.npy $(GALLERYDIR)/advanced_io/*.nwb
 
 html:
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
diff --git a/docs/gallery/advanced_io/parallelio.py b/docs/gallery/advanced_io/parallelio.py
index a91e567d5..39fed657a 100644
--- a/docs/gallery/advanced_io/parallelio.py
+++ b/docs/gallery/advanced_io/parallelio.py
@@ -30,7 +30,7 @@
 #     from dateutil import tz
 #     from pynwb import NWBHDF5IO, NWBFile, TimeSeries
 #     from datetime import datetime
-#     from hdmf.data_utils import DataChunkIterator
+#     from hdmf.backends.hdf5.h5_utils import H5DataIO
 #
 #     start_time = datetime(2018, 4, 25, 2, 30, 3, tzinfo=tz.gettz('US/Pacific'))
 #     fname = 'test_parallel_pynwb.nwb'
@@ -40,9 +40,11 @@
 #     # write in parallel but we do not write any data
 #     if rank == 0:
 #         nwbfile = NWBFile('aa', 'aa', start_time)
-#         data = DataChunkIterator(data=None, maxshape=(4,), dtype=np.dtype('int'))
+#         data = H5DataIO(shape=(4,),
+#                         maxshape=(4,),
+#                         dtype=np.dtype('int'))
 #
-#         nwbfile.add_acquisition(TimeSeries('ts_name', description='desc', data=data,
+#         nwbfile.add_acquisition(TimeSeries(name='ts_name', description='desc', data=data,
 #                                            rate=100., unit='m'))
 #         with NWBHDF5IO(fname, 'w') as io:
 #             io.write(nwbfile)
@@ -58,24 +60,9 @@
 #         print(io.read().acquisition['ts_name'].data[rank])
 
 ####################
-# To specify details about chunking, compression and other HDF5-specific I/O options,
-# we can wrap data via ``H5DataIO``, e.g,
 #
-# .. code-block:: python
-#
-#     data = H5DataIO(DataChunkIterator(data=None, maxshape=(100000, 100),
-#                                       dtype=np.dtype('float')),
-#                     chunks=(10, 10), maxshape=(None, None))
+# .. note::
-#
-# would initialize your dataset with a shape of (100000, 100) and maxshape of (None, None)
-# and your own custom chunking of (10, 10).
-
-####################
-# Disclaimer
-# ----------------
+#    Using :py:class:`hdmf.backends.hdf5.h5_utils.H5DataIO` we can also specify further
+#    details about the data layout, e.g., via the chunking and compression parameters.
 #
-# External links included in the tutorial are being provided as a convenience and for informational purposes only;
-# they do not constitute an endorsement or an approval by the authors of any of the products, services or opinions of
-# the corporation or organization or individual. The authors bear no responsibility for the accuracy, legality or
-# content of the external site or for that of subsequent links. Contact the external site for answers to questions
-# regarding its content.
diff --git a/docs/gallery/advanced_io/iterative_write.py b/docs/gallery/advanced_io/plot_iterative_write.py
similarity index 89%
rename from docs/gallery/advanced_io/iterative_write.py
rename to docs/gallery/advanced_io/plot_iterative_write.py
index 26f7d1a9d..3884c333a 100644
--- a/docs/gallery/advanced_io/iterative_write.py
+++ b/docs/gallery/advanced_io/plot_iterative_write.py
@@ -42,6 +42,7 @@
 # * **Data generators** Data generators are in many ways similar to data streams only that the
 #   data is typically being generated locally and programmatically rather than from an external
 #   data source.
+#
 # * **Sparse data arrays** In order to reduce storage size of sparse arrays a challenge is that while
 #   the data array (e.g., a matrix) may be large, only few values are set. To avoid storage overhead
 #   for storing the full array we can employ (in HDF5) a combination of chunking, compression, and
@@ -71,6 +72,13 @@
 #   This is useful for buffered I/O operations, e.g., to improve performance by accumulating data in memory and
 #   writing larger blocks at once.
 #
+# * :py:class:`~hdmf.data_utils.GenericDataChunkIterator` is a semi-abstract version of a
+#   :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` that automatically handles the selection of
+#   buffer regions and resolves communication of compatible chunk regions. Users specify chunk
+#   and buffer shapes or sizes and the iterator will manage how to break the data up for write.
+#   For further details, see the
+#   :hdmf-docs:`GenericDataChunkIterator tutorial `.
+#
 
 ####################
 # Iterative Data Write: API
@@ -107,11 +115,15 @@
 from pynwb import NWBHDF5IO
 
 
-def write_test_file(filename, data):
+def write_test_file(filename, data, close_io=True):
     """
+    Simple helper function to write an NWBFile with a single timeseries containing data
 
     :param filename: String with the name of the output file
     :param data: The data of the timeseries
+    :param close_io: Close and destroy the NWBHDF5IO object used for writing (default=True)
+
+    :returns: None if close_io==True otherwise return NWBHDF5IO object used for write
     """
 
     # Create a test NWBfile
@@ -133,7 +145,11 @@ def write_test_file(filename, data):
     # Write the data to file
     io = NWBHDF5IO(filename, 'w')
     io.write(nwbfile)
-    io.close()
+    if close_io:
+        io.close()
+        del io
+        io = None
+    return io
 
 
 ####################
@@ -196,12 +212,6 @@ def iter_sin(chunk_length=10, max_chunks=100):
                                       str(data.dtype)))
 
 ####################
-# ``[Out]:``
-#
-# .. code-block:: python
-#
-#     maxshape=(None, 10), recommended_data_shape=(1, 10), dtype=float64
-#
 # As we can see :py:class:`~hdmf.data_utils.DataChunkIterator` automatically recommends
 # in its ``maxshape`` that the first dimensions of our array should be unlimited (``None``) and the second
 # dimension be ``10`` (i.e., the length of our chunk. Since :py:class:`~hdmf.data_utils.DataChunkIterator`
@@ -216,8 +226,11 @@ def iter_sin(chunk_length=10, max_chunks=100):
 # :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generators yields in **consecutive order**
 # **single** complete element along the **first dimension** of our a array (i.e., iterate over the first
 # axis and yield one-element-at-a-time). This behavior is useful in many practical cases. However, if
-# this strategy does not match our needs, then you can alternatively implement our own derived
-# :py:class:`~hdmf.data_utils.AbstractDataChunkIterator`. We show an example of this next.
+# this strategy does not match our needs, then using :py:class:`~hdmf.data_utils.GenericDataChunkIterator`
+# or implementing your own derived :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more
+# appropriate. We show an example of how to implement your own :py:class:`~hdmf.data_utils.AbstractDataChunkIterator`
+# next. See the :hdmf-docs:`GenericDataChunkIterator tutorial ` as
+# part of the HDMF documentation for details on how to use :py:class:`~hdmf.data_utils.GenericDataChunkIterator`.
 #
@@ -387,26 +400,6 @@ def maxshape(self):
 print("    Reduction     : %.2f x" % (expected_size / file_size_largechunks_compressed))
 
 ####################
-# ``[Out]:``
-#
-# .. code-block:: python
-#
-#     1) Sparse Matrix Size:
-#        Expected Size : 8000000.00 MB
-#        Occupied Size : 0.80000 MB
-#     2) NWB HDF5 file (no compression):
-#        File Size     : 0.89 MB
-#        Reduction     : 9035219.28 x
-#     3) NWB HDF5 file (with GZIP compression):
-#        File Size     : 0.88847 MB
-#        Reduction     : 9004283.79 x
-#     4) NWB HDF5 file (large chunks):
-#        File Size     : 80.08531 MB
-#        Reduction     : 99893.47 x
-#     5) NWB HDF5 file (large chunks with compression):
-#        File Size     : 1.14671 MB
-#        Reduction     : 6976450.12 x
-#
 # Discussion
 # ^^^^^^^^^^
 #
@@ -490,7 +483,7 @@ def maxshape(self):
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
 # Note, we here use a generator for simplicity but we could equally well also implement our own
-# :py:class:`~hdmf.data_utils.AbstractDataChunkIterator`.
+# :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` or use :py:class:`~hdmf.data_utils.GenericDataChunkIterator`.
 
 
 def iter_largearray(filename, shape, dtype='float64'):
@@ -553,15 +546,6 @@ def iter_largearray(filename, shape, dtype='float64'):
 else:
     print("ERROR: Mismatch between data")
 
-
-####################
-# ``[Out]:``
-#
-# .. code-block:: python
-#
-#     Success: All data values match
-
-
 ####################
 # Example: Convert arrays stored in multiple files
 # -----------------------------------------------------
@@ -705,46 +689,37 @@ def maxshape(self):
 #
 
 from hdmf.backends.hdf5.h5_utils import H5DataIO
 
-write_test_file(filename='basic_alternative_custom_write.nwb',
-                data=H5DataIO(data=np.empty(shape=(0, 10), dtype='float'),
-                              maxshape=(None, 10),  # <-- Make the time dimension resizable
-                              chunks=(131072, 2),   # <-- Use 2MB chunks
-                              compression='gzip',   # <-- Enable GZip compression
-                              compression_opts=4,   # <-- GZip aggression
-                              shuffle=True,         # <-- Enable shuffle filter
-                              fillvalue=np.nan      # <-- Use NAN as fillvalue
-                              )
-                )
+# Use H5DataIO to specify how to setup the dataset in the file
+dataio = H5DataIO(
+    shape=(0, 10),            # Initial shape. If the shape is known then set to full shape
+    dtype=np.dtype('float'),  # dtype of the dataset
+    maxshape=(None, 10),      # Make the time dimension resizable
+    chunks=(131072, 2),       # Use 2MB chunks
+    compression='gzip',       # Enable GZip compression
+    compression_opts=4,       # GZip aggression
+    shuffle=True,             # Enable shuffle filter
+    fillvalue=np.nan          # Use NAN as fillvalue
+)
+
+# Write a test NWB file with our dataset and keep the NWB file (i.e., the NWBHDF5IO object) open
+io = write_test_file(
+    filename='basic_alternative_custom_write.nwb',
+    data=dataio,
+    close_io=False
+)
 
 ####################
 # Step 2: Get the dataset(s) to be updated
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
-from pynwb import NWBHDF5IO  # noqa
-
-io = NWBHDF5IO('basic_alternative_custom_write.nwb', mode='a')
-nwbfile = io.read()
-data = nwbfile.get_acquisition('synthetic_timeseries').data
-
-# Let's check what the data looks like
-print("Shape %s, Chunks: %s, Maxshape=%s" % (str(data.shape), str(data.chunks), str(data.maxshape)))
-
-####################
-# ``[Out]:``
-#
-# .. code-block:: python
-#
-#     Shape (0, 10), Chunks: (131072, 2), Maxshape=(None, 10)
-#
-####################
-# Step 3: Implement custom write
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#
+# Let's check what the data looks like before we write
+print("Before write: Shape= %s, Chunks= %s, Maxshape=%s" %
+      (str(dataio.dataset.shape), str(dataio.dataset.chunks), str(dataio.dataset.maxshape)))
 
-data.resize((8, 10))  # <-- Allocate the space with need
-data[0:3, :] = 1      # <-- Write timesteps 0,1,2
-data[3:6, :] = 2      # <-- Write timesteps 3,4,5, Note timesteps 6,7 are not being initialized
+dataio.dataset.resize((8, 10))  # <-- Allocate space. Only needed if we didn't set the initial shape large enough
+dataio.dataset[0:3, :] = 1      # <-- Write timesteps 0,1,2
+dataio.dataset[3:6, :] = 2      # <-- Write timesteps 3,4,5, Note timesteps 6,7 are not being initialized
 
 io.close()  # <-- Close the file
 
@@ -756,20 +731,13 @@ def maxshape(self):
 
 io = NWBHDF5IO('basic_alternative_custom_write.nwb', mode='a')
 nwbfile = io.read()
-data = nwbfile.get_acquisition('synthetic_timeseries').data
-print(data[:])
+dataset = nwbfile.get_acquisition('synthetic_timeseries').data
+print("After write: Shape= %s, Chunks= %s, Maxshape=%s" %
+      (str(dataset.shape), str(dataset.chunks), str(dataset.maxshape)))
+print(dataset[:])
 io.close()
 
 ####################
-# ``[Out]:``
-#
-# .. code-block:: python
-#
-#     [[  1.   1.   1.   1.   1.   1.   1.   1.   1.   1.]
-#      [  1.   1.   1.   1.   1.   1.   1.   1.   1.   1.]
-#      [  1.   1.   1.   1.   1.   1.   1.   1.   1.   1.]
-#      [  2.   2.   2.   2.   2.   2.   2.   2.   2.   2.]
-#      [  2.   2.   2.   2.   2.   2.   2.   2.   2.   2.]
-#      [  2.   2.   2.   2.   2.   2.   2.   2.   2.   2.]
-#      [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan]
-#      [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan]]
+# We allocated our data to be ``shape=(8, 10)`` but we only wrote data to the first 6 rows of the
+# array. As expected, we therefore see our ``fillvalue`` of ``nan`` in the last two rows of the data.
+#
diff --git a/docs/gallery/general/read_basics.py b/docs/gallery/general/read_basics.py
index 30dd93285..13fa3e9b7 100644
--- a/docs/gallery/general/read_basics.py
+++ b/docs/gallery/general/read_basics.py
@@ -331,7 +331,7 @@
 # object and accessing its attributes, but it may be useful to explore the data in a
 # more interactive, visual way.
 #
-# You can use `NWBWidgets `_,
+# You can use `NWBWidgets `_,
 # a package containing interactive widgets for visualizing NWB data,
 # or you can use the `HDFView `_
 # tool, which can open any generic HDF5 file, which an NWB file is.
diff --git a/docs/make.bat b/docs/make.bat
index 1e2a19ff4..dcafe003d 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -10,6 +10,7 @@ if "%SPHINXAPIDOC%" == "" (
 )
 set BUILDDIR=_build
 set RSTDIR=source
+set GALLERYDIR=gallery
 set SRCDIR=../src
 set PKGNAME=pynwb
 set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% %RSTDIR%
@@ -51,6 +52,8 @@ if "%1" == "clean" (
 	del /q /s %BUILDDIR%\*
 	del /q %RSTDIR%\%PKGNAME%*.rst
 	rmdir /q /s %RSTDIR%\tutorials
+	del /q /s %GALLERYDIR%\advanced_io\*.npy
+	del /q /s %GALLERYDIR%\advanced_io\*.nwb
 	goto end
 )
diff --git a/docs/source/conf.py b/docs/source/conf.py
index ca7131e3c..8cd05198b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -151,6 +151,7 @@ def __call__(self, filename):
                  'nwb_extension': ('https://github.com/nwb-extensions/%s', ''),
                  'pynwb': ('https://github.com/NeurodataWithoutBorders/pynwb/%s', ''),
                  'nwb_overview': ('https://nwb-overview.readthedocs.io/en/latest/%s', ''),
+                 'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', ''),
                  'dandi': ('https://www.dandiarchive.org/%s', '')}
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index a2651db27..1f6883d1c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -11,7 +11,7 @@ efficiently working with neurodata stored in the NWB format. If you are new to NWB
 and would like to learn more, then please also visit the :nwb_overview:`NWB Overview <>` website,
 which provides an entry point for researchers and developers interested in using NWB.
 
-`Neurodata Without Borders (NWB) `_ is a project to develop a
+`Neurodata Without Borders (NWB) `_ is a project to develop a
 unified data format for cellular-based neurophysiology data, focused on the dynamics of groups of neurons
 measured under a large range of experimental conditions.
diff --git a/docs/source/software_process.rst b/docs/source/software_process.rst
index 6c2e34cb3..07e809c9f 100644
--- a/docs/source/software_process.rst
+++ b/docs/source/software_process.rst
@@ -30,7 +30,7 @@ codecov_, and the other badge shows the percentage coverage reported from codeco
 codecov_, which shows line by line which lines are covered by the tests.
 
 .. _coverage: https://coverage.readthedocs.io
-.. _codecov: https://codecov.io/gh/NeurodataWithoutBorders/pynwb/tree/dev/src/pynwb
+.. _codecov: https://app.codecov.io/gh/NeurodataWithoutBorders/pynwb/tree/dev/src/pynwb
 
 --------------------------
 Requirement Specifications
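
As a quick illustration of the workflow these tutorial changes describe, the sketch below creates an empty, resizable dataset with ``H5DataIO`` and then writes into it through the ``H5DataIO.dataset`` property after ``io.write``. It is a minimal, hypothetical example: the file name, ``TimeSeries`` parameters, and session metadata are illustrative and are not part of this patch.

```python
# Minimal sketch of the H5DataIO.dataset workflow shown in the updated tutorial.
# The file name and TimeSeries/NWBFile parameters are illustrative only.
from datetime import datetime

import numpy as np
from dateutil import tz
from hdmf.backends.hdf5.h5_utils import H5DataIO
from pynwb import NWBHDF5IO, NWBFile, TimeSeries

# Set up an empty, resizable dataset; no data values are written yet
data = H5DataIO(shape=(0, 10),
                dtype=np.dtype('float'),
                maxshape=(None, 10),
                chunks=(131072, 2),
                fillvalue=np.nan)

nwbfile = NWBFile(session_description='demo', identifier='demo',
                  session_start_time=datetime(2022, 1, 1, tzinfo=tz.gettz('US/Pacific')))
nwbfile.add_acquisition(TimeSeries(name='synthetic_timeseries', data=data,
                                   unit='n/a', rate=1.0))

io = NWBHDF5IO('custom_write_sketch.nwb', mode='w')
io.write(nwbfile)

# After write, H5DataIO.dataset refers to the created h5py dataset,
# so we can resize it and fill in values directly before closing the file
data.dataset.resize((8, 10))
data.dataset[0:3, :] = 1
io.close()
```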
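The tutorial text above also points readers to ``GenericDataChunkIterator``. The following is a rough sketch of what a subclass can look like; the wrapper class is hypothetical, and the overridden method names (``_get_data``, ``_get_maxshape``, ``_get_dtype``) and the ``chunk_mb`` argument are assumptions based on the HDMF API rather than anything defined in this patch.

```python
# Rough sketch of a GenericDataChunkIterator subclass; the class itself is a
# hypothetical example and the method names follow the HDMF API as documented.
import numpy as np
from hdmf.data_utils import GenericDataChunkIterator


class InMemoryArrayIterator(GenericDataChunkIterator):
    """Iterate over an in-memory array (a stand-in for a slow, on-disk source)."""

    def __init__(self, array, **kwargs):
        # Store the source before calling super(), which queries shape and dtype
        self._array = array
        super().__init__(**kwargs)

    def _get_data(self, selection):
        # Return the values for the requested selection (a tuple of slices)
        return self._array[selection]

    def _get_maxshape(self):
        return self._array.shape

    def _get_dtype(self):
        return self._array.dtype


# The iterator decides how to break the array into buffers and chunks for write
data = InMemoryArrayIterator(np.random.rand(10000, 10), chunk_mb=1.0)
```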