diff --git a/docs/notebooks/zarr_file_conversion_test.ipynb b/docs/notebooks/zarr_file_conversion_test.ipynb new file mode 100644 index 000000000..25e59b47a --- /dev/null +++ b/docs/notebooks/zarr_file_conversion_test.ipynb @@ -0,0 +1,291 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b03b7582", + "metadata": {}, + "source": [ + "# Installing the Zarr Backend for NWB" + ] + }, + { + "cell_type": "markdown", + "id": "6bd6b4c2", + "metadata": {}, + "source": [ + "```\n", + "conda create -n nwbzarr python=3.9\n", + "conda activate nwbzarr\n", + "conda install Cython\n", + "conda install numpy==1.21.0\n", + "conda install pkgconfig\n", + "conda install h5py=3.3.0\n", + "conda install pandas==1.3.0\n", + "conda install python-dateutil==2.8.1\n", + "git clone --recurse-submodules https://github.com/NeurodataWithoutBorders/pynwb.git\n", + "cd pynwb\n", + "git checkout add/zarrio\n", + "pip install -e .\n", + "pip uninstall hdmf\n", + "git clone --recurse-submodules https://github.com/hdmf-dev/hdmf.git\n", + "cd hdmf\n", + "git checkout 1.0.3-zarr\n", + "conda install --file requirements.txt\n", + "pip install -e .\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "96a1c56c", + "metadata": {}, + "source": [ + "To use this notebook, the following optional package should also be installed\n", + "\n", + "```\n", + "pip install dandi\n", + "conda install jupyter\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "41b2187e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil" + ] + }, + { + "cell_type": "markdown", + "id": "d5841dfd", + "metadata": {}, + "source": [ + "# Download a file from DANDI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ac7df3b3", + "metadata": {}, + "outputs": [], + "source": [ + "from dandi.dandiapi import DandiAPIClient\n", + "\n", + "dandiset_id = '000207' \n", + "filepath = \"sub-1/sub-1_ses-1_ecephys+image.nwb\" # 5 MB file\n", + "with 
DandiAPIClient() as client:\n", + " asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath)\n", + " s3_path = asset.get_content_url(follow_redirects=1, strip_query=True)\n", + " filename = os.path.basename(asset.path)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7d2b53df", + "metadata": {}, + "outputs": [], + "source": [ + "asset.download(filename)" + ] + }, + { + "cell_type": "markdown", + "id": "7b37360b", + "metadata": {}, + "source": [ + "# Define output settings and clean up old files " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "854dc0ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Removing test_hdf5_sub-1_ses-1_ecephys+image.nwb\n", + "Removing test_zarr_sub-1_ses-1_ecephys+image.nwb\n" + ] + } + ], + "source": [ + "zarr_filename = \"test_zarr_\" + filename\n", + "hdf_filename = \"test_hdf5_\" + filename\n", + "\n", + "# Delete our converted HDF5 file from previous runs of this notebook\n", + "if os.path.exists(hdf_filename):\n", + " print(\"Removing %s\" % hdf_filename)\n", + " os.remove(hdf_filename)\n", + "# Delete our converted Zarr file from previous runs of this notebook\n", + "if os.path.exists(zarr_filename):\n", + " print(\"Removing %s\" % zarr_filename)\n", + " shutil.rmtree(zarr_filename)" + ] + }, + { + "cell_type": "markdown", + "id": "240691c5", + "metadata": {}, + "source": [ + "# Convert the file to Zarr" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "981df005", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/oruebel/Devel/nwb/zarr/hdmf/src/hdmf/backends/zarr/zarr_tools.py:78: UserWarning: \u001b[91mThe ZarrIO backend is experimental. It is under active development. 
The ZarrIO backend may change or be removed at any time and backward compatibility is not guaranteed.\u001b[0m\n", + " warnings.warn(warn_msg)\n" + ] + } + ], + "source": [ + "from pynwb import NWBHDF5IO, NWBZarrIO\n", + "\n", + "with NWBHDF5IO(filename , 'r', load_namespaces=False) as read_io:\n", + " with NWBZarrIO(zarr_filename, mode='w', chunking=True) as export_io:\n", + " export_io.export(src_io=read_io, write_args=dict(link_data=False))" + ] + }, + { + "cell_type": "markdown", + "id": "cd15edcc", + "metadata": {}, + "source": [ + "# Read the Zarr file back in" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d8aa004", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/oruebel/Devel/nwb/zarr/pynwb/src/pynwb/base.py:167: UserWarning: Length of data does not match length of timestamps. Your data may be transposed. Time should be on the 0th dimension\n", + " warn(\"Length of data does not match length of timestamps. Your data may be transposed. Time should be on \"\n" + ] + } + ], + "source": [ + "zr = NWBZarrIO(zarr_filename, 'r')\n", + "zf = zr.read()" + ] + }, + { + "cell_type": "markdown", + "id": "b5b97c8f", + "metadata": {}, + "source": [ + "# Convert the Zarr file back to HDF5" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c74a470e", + "metadata": {}, + "outputs": [], + "source": [ + "with NWBZarrIO(zarr_filename, mode='r') as read_io:\n", + " with NWBHDF5IO(hdf_filename , 'w') as export_io:\n", + " export_io.export(src_io=read_io, write_args=dict(link_data=False))" + ] + }, + { + "cell_type": "markdown", + "id": "f1cc2427", + "metadata": {}, + "source": [ + "# Read the new HDF5 file back\n", + "\n", + "Now our file has been converted from HDF5 to Zarr and back again to HDF5. 
Here we check that we can still read that file" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "51f008f2", + "metadata": {}, + "outputs": [], + "source": [ + "with NWBHDF5IO(hdf_filename , 'r') as hr:\n", + " hf = hr.read()" + ] + }, + { + "cell_type": "markdown", + "id": "a6a56195", + "metadata": {}, + "source": [ + "# Notes\n", + "\n", + "The ZarrIO backend for NWB is under development as part of the following PRs on GitHub:\n", + "\n", + "* **HDMF**: https://github.com/hdmf-dev/hdmf/pull/696\n", + "\n", + " * Related PR: https://github.com/hdmf-dev/hdmf/pull/697 This PR includes all of the general changes to HDMF that we did to implement the Zarr backend. Once #697 is merged #696 should again be synced with dev, so that the PR then only includes the changes to add Zarr itself.\n", + " \n", + "* **PyNWB**: https://github.com/NeurodataWithoutBorders/pynwb/pull/1018" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6bbed78", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ffda2b3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index f91ffc0d7..2b92d3561 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -222,8 +222,7 @@ def __init__(self, **kwargs): warn("loading namespaces from file - ignoring 'manager'") if extensions is not None: warn("loading namespaces from file - ignoring 'extensions' argument") - # namespaces are not loaded when creating an 
# Zarr support is optional: NWBZarrIO is only defined when both the HDMF Zarr
# backend and the zarr package can be imported.
try:
    from hdmf.backends.zarr.zarr_tools import ZarrIO as _ZarrIO
    import zarr
except ImportError:
    # NOTE: the ``warnings`` module has no ``info`` function, so importing it here
    # would raise a second ImportError from inside this handler and break
    # ``import pynwb`` whenever Zarr is missing. Report the skipped optional
    # backend through ``logging`` at INFO level instead.
    import logging
    logging.getLogger(__name__).info("Ignoring optional NWBZarrIO. Zarr is not installed")
else:
    class NWBZarrIO(_ZarrIO):
        """ZarrIO subclass that selects the BuildManager used to read/write NWB data.

        The manager is built from the namespaces cached in the file, from the given
        extensions, or from the default type map, depending on the arguments.
        """

        @docval({'name': 'path', 'type': str, 'doc': 'the path to the Zarr file'},
                {'name': 'mode', 'type': str,
                 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-")'},
                {'name': 'load_namespaces', 'type': bool,
                 'doc': 'whether or not to load cached namespaces from given path - not applicable in write mode',
                 'default': False},
                {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O',
                 'default': None},
                {'name': 'extensions', 'type': (str, TypeMap, list),
                 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps',
                 'default': None},
                {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool),
                 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',
                 'default': None},
                {'name': 'chunking', 'type': bool, 'doc': "Enable/Disable chunking of datasets by default",
                 'default': True})
        def __init__(self, **kwargs):
            path, mode, manager, extensions, load_namespaces, synchronizer, chunking = \
                popargs('path', 'mode', 'manager', 'extensions',
                        'load_namespaces', 'synchronizer', 'chunking', kwargs)
            if load_namespaces:
                # Namespaces cached in the file take precedence over anything passed in
                if manager is not None:
                    warn("loading namespaces from file - ignoring 'manager'")
                if extensions is not None:
                    warn("loading namespaces from file - ignoring 'extensions' argument")
                # namespaces are not loaded when creating an NWBZarrIO object in write mode
                if 'w' in mode or mode == 'x':
                    raise ValueError("cannot load namespaces from file when writing to it")

                tm = get_type_map()
                super(NWBZarrIO, self).load_namespaces(tm, path)
                manager = BuildManager(tm)
            else:
                if manager is not None and extensions is not None:
                    raise ValueError("'manager' and 'extensions' cannot be specified together")
                elif extensions is not None:
                    manager = get_manager(extensions=extensions)
                elif manager is None:
                    manager = get_manager()
            super(NWBZarrIO, self).__init__(path,
                                            manager=manager,
                                            mode=mode,
                                            synchronizer=synchronizer,
                                            chunking=chunking)

        @docval({'name': 'src_io', 'type': HDMFIO, 'doc': 'the HDMFIO object for reading the data to export'},
                {'name': 'nwbfile', 'type': 'NWBFile',
                 'doc': 'the NWBFile object to export. If None, then the entire contents of src_io will be exported',
                 'default': None},
                {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`',
                 'default': dict()})
        def export(self, **kwargs):
            # The parent class names this argument 'container'; forward 'nwbfile' under that name
            nwbfile = popargs('nwbfile', kwargs)
            kwargs['container'] = nwbfile
            call_docval_func(super().export, kwargs)
import io as __io # noqa: F401,E402 from .core import NWBContainer, NWBData # noqa: F401,E402 from .base import TimeSeries, ProcessingModule # noqa: F401,E402 diff --git a/tests/integration/hdf5/test_io.py b/tests/integration/hdf5/test_io.py index 30e939c9b..92a15ad1e 100644 --- a/tests/integration/hdf5/test_io.py +++ b/tests/integration/hdf5/test_io.py @@ -6,8 +6,8 @@ from pynwb import NWBFile, TimeSeries, get_manager, NWBHDF5IO, validate -from hdmf.backends.io import UnsupportedOperation from hdmf.backends.hdf5 import HDF5IO, H5DataIO +from hdmf.backends.io import UnsupportedOperation from hdmf.data_utils import DataChunkIterator from hdmf.build import GroupBuilder, DatasetBuilder from hdmf.spec import NamespaceCatalog @@ -92,7 +92,7 @@ def test_write_cache_spec(self): Round-trip test for writing spec and reading it back in ''' with HDF5IO(self.path, manager=self.manager, mode="a") as io: - io.write(self.container) + io.write(self.container, cache_spec=True) with File(self.path, 'r') as f: self.assertIn('specifications', f) @@ -203,7 +203,7 @@ def test_write_cache_spec(self): with File(self.path, 'w') as fil: with HDF5IO(self.path, manager=self.manager, file=fil, mode='a') as io: - io.write(self.container) + io.write(self.container, cache_spec=True) with File(self.path, 'r') as f: self.assertIn('specifications', f) diff --git a/tests/integration/hdf5/test_modular_storage.py b/tests/integration/hdf5/test_modular_storage.py index db1608865..c995317ff 100644 --- a/tests/integration/hdf5/test_modular_storage.py +++ b/tests/integration/hdf5/test_modular_storage.py @@ -14,6 +14,12 @@ class TestTimeSeriesModular(TestCase): def setUp(self): + test_case_name = str(self.id()).split(".")[-1] + if test_case_name == "test_zarr_roundtrip": + self.skipTest("Modular storage testing does not apply to ZarrIO") + + self.__manager = get_manager() + self.start_time = datetime(1971, 1, 1, 12, tzinfo=tzutc()) self.data = np.arange(2000).reshape((2, 1000)) diff --git 
"""Base classes for PyNWB container integration tests.

``TestMapNWBContainer`` provides container comparison helpers, ``TestMapRoundTrip``
writes a container to disk with the HDF5 and Zarr backends and reads it back, and
``TestDataInterfaceIO`` specializes the round trip for acquisition data.
"""
import unittest2 as unittest
from datetime import datetime
from dateutil.tz import tzlocal, tzutc
import os
import numpy as np
import h5py
import shutil
import numpy.testing as npt

from pynwb import NWBContainer, get_manager, NWBFile, NWBData, NWBHDF5IO, NWBZarrIO, validate as pynwb_validate
from hdmf.backends.hdf5 import HDF5IO
from hdmf.backends.zarr import ZarrIO

from zarr.core import Array as ZarrArray

CORE_NAMESPACE = 'core'

# Registry filled by the ``container_test`` decorator: key -> test class
container_tests = dict()


def container_test(container):
    """Return a class decorator that registers the decorated class in ``container_tests``.

    :param container: key under which the decorated class is stored
    :returns: decorator that records the class and returns it unchanged
    """
    def _dec(cls):
        # The dict is mutated, never rebound, so no ``global`` statement is needed
        container_tests[container] = cls
        return cls
    return _dec


class TestMapNWBContainer(unittest.TestCase):
    """Base test case that owns a build manager and a container under test."""

    _required_tests = ('test_build', 'test_construct')

    def setUp(self):
        self.__manager = get_manager()
        self.container = self.setUpContainer()

    @property
    def required_tests(self):
        return self._required_tests

    @property
    def manager(self):
        return self.__manager

    @unittest.skip("deprecated")
    def test_build(self):
        """
        As of 20190110, this test has been deprecated. Maintaining hardcoded builder objects has become
        increasingly difficult, and offers little in the way of debugging and identifying problems
        """
        try:
            self.builder = self.setUpBuilder()
        except unittest.SkipTest:
            raise unittest.SkipTest("cannot run construct test for %s -- setUpBuilder not implemented" %
                                    self.__class__.__name__)
        self.maxDiff = None
        result = self.manager.build(self.container)
        # do something here to validate the result Builder against the spec
        self.assertDictEqual(result, self.builder)

    @unittest.skip("deprecated")
    def test_construct(self):
        """
        As of 20190110, this test has been deprecated. Maintaining hardcoded builder objects has become
        increasingly difficult, and offers little in the way of debugging and identifying problems
        """
        try:
            self.builder = self.setUpBuilder()
        except unittest.SkipTest:
            raise unittest.SkipTest("cannot run construct test for %s -- setUpBuilder not implemented" %
                                    self.__class__.__name__)
        result = self.manager.construct(self.builder)
        self.assertContainerEqual(result, self.container)

    def setUpBuilder(self):
        ''' Should return the Builder that represents the Container'''
        raise unittest.SkipTest('Cannot run test unless setUpBuilder is implemented')

    def setUpContainer(self):
        ''' Should return the Container to build and read/write'''
        raise unittest.SkipTest('Cannot run test unless setUpContainer is implemented')

    def assertContainerEqual(self, container1, container2):
        '''
        Assert that two containers have the same type and equal NWB fields.

        container1 is what was read or generated
        container2 is what is hardcoded in the TestCase
        '''
        type1 = type(container1)
        type2 = type(container2)
        self.assertEqual(type1, type2)
        for nwbfield in container1.__nwbfields__:
            # One subTest per field so a mismatch reports which field differs
            with self.subTest(nwbfield=nwbfield, container_type=type1.__name__):
                self._assertFieldEqual(getattr(container1, nwbfield), getattr(container2, nwbfield))

    def _assertFieldEqual(self, f1, f2):  # noqa: C901
        """Compare one field value read from file (f1) against the expected value (f2)."""
        # Load on-disk datasets into memory before comparing
        if isinstance(f1, h5py.Dataset):
            f1 = f1[()]
        if isinstance(f1, ZarrArray):
            f1 = f1[:]
        if isinstance(f1, (tuple, list, np.ndarray)):
            self._assertSequenceFieldEqual(f1, f2)
        elif isinstance(f1, dict) and len(f1) and isinstance(next(iter(f1.values())), NWBContainer):
            f1_keys = set(f1.keys())
            f2_keys = set(f2.keys())
            self.assertSetEqual(f1_keys, f2_keys)
            for k in f1_keys:
                with self.subTest(module_name=k):
                    self.assertContainerEqual(f1[k], f2[k])
        elif isinstance(f1, NWBContainer):
            self.assertContainerEqual(f1, f2)
        elif isinstance(f1, NWBData) or isinstance(f2, NWBData):
            if isinstance(f1, NWBData) and isinstance(f2, NWBData):
                self.assertDataEqual(f1, f2)
            elif isinstance(f1, NWBData):
                self.assertTrue(np.array_equal(f1.data, f2))
            else:
                # Only the expected value is wrapped in NWBData, so unwrap f2 (not f1)
                self.assertTrue(np.array_equal(f1, f2.data))
        else:
            if isinstance(f1, (float, np.float32, np.float16)):
                npt.assert_almost_equal(f1, f2)
            else:
                self.assertEqual(f1, f2)

    def _assertSequenceFieldEqual(self, seq1, seq2):
        """Compare two sequences element-wise, dispatching on the type of the first element."""
        # zip() silently truncates to the shorter input, so check the lengths first
        self.assertEqual(len(seq1), len(seq2))
        if len(seq1) == 0:
            return
        first = seq1[0]
        if isinstance(first, NWBContainer):
            for sub1, sub2 in zip(seq1, seq2):
                self.assertContainerEqual(sub1, sub2)
        elif isinstance(first, NWBData):
            for sub1, sub2 in zip(seq1, seq2):
                self.assertDataEqual(sub1, sub2)
        elif isinstance(first, float):
            for v1, v2 in zip(seq1, seq2):
                self.assertAlmostEqual(v1, v2, places=6)
        else:
            self.assertTrue(np.array_equal(seq1, seq2))

    def assertDataEqual(self, data1, data2):
        """Assert that two data objects have the same type and length."""
        self.assertEqual(type(data1), type(data2))
        self.assertEqual(len(data1), len(data2))


class TestMapRoundTrip(TestMapNWBContainer):
    """Write the container to a file, read it back, and compare it with the original."""

    _required_tests = ('test_roundtrip',)
    run_injected_file_test = False

    def setUp(self):
        # The parent setUp already creates self.container; do not build it a second time
        super(TestMapRoundTrip, self).setUp()
        self.object_id = self.container.object_id
        self.start_time = datetime(1971, 1, 1, 12, tzinfo=tzutc())
        self.create_date = datetime(2018, 4, 15, 12, tzinfo=tzlocal())
        self.container_type = self.container.__class__.__name__
        # The Zarr test writes to its own path so it does not collide with the HDF5 file
        test_case_name = str(self.id()).split(".")[-1]
        if test_case_name == "test_zarr_roundtrip":
            self.filename = 'test_zarrio_%s' % self.container_type
        else:
            self.filename = 'test_%s.nwb' % self.container_type
        self.writer = None
        self.reader = None

    def tearDown(self):
        if self.writer is not None:
            self.writer.close()
        if self.reader is not None:
            self.reader.close()
        # Setting CLEAN_NWB to a false-like value keeps the output for inspection
        if os.path.exists(self.filename) and os.getenv("CLEAN_NWB", '1') not in ('0', 'false', 'FALSE', 'False'):
            if os.path.isfile(self.filename):
                os.remove(self.filename)
            elif os.path.isdir(self.filename):
                shutil.rmtree(self.filename)

    def _roundtrip(self, io_cls, cache_spec):
        """Write an NWBFile holding the container with ``io_cls``, read it back, return the container.

        :param io_cls: I/O class used for both writing and reading (HDF5IO or ZarrIO)
        :param cache_spec: forwarded to the writer's ``write`` call
        """
        description = 'a file to test writing and reading a %s' % self.container_type
        identifier = 'TEST_%s' % self.container_type
        nwbfile = NWBFile(description, identifier, self.start_time, file_create_date=self.create_date)
        self.addContainer(nwbfile)

        self.writer = io_cls(self.filename, manager=get_manager(), mode='w')
        self.writer.write(nwbfile, cache_spec=cache_spec)
        self.writer.close()
        self.reader = io_cls(self.filename, manager=get_manager(), mode='r')
        self.read_nwbfile = self.reader.read()

        try:
            return self.getContainer(self.read_nwbfile)
        except Exception:
            # Release the file right away; tearDown skips a reader that is None
            self.reader.close()
            self.reader = None
            raise

    def roundtripContainer(self, cache_spec=False):
        """Round-trip the container through an HDF5 file and return the read container."""
        return self._roundtrip(HDF5IO, cache_spec)

    def test_roundtrip(self):
        self.read_container = self.roundtripContainer()
        # make sure we get a completely new object
        self.assertIsNotNone(str(self.container))  # added as a test to make sure printing works
        self.assertIsNotNone(str(self.read_container))
        self.assertNotEqual(id(self.container), id(self.read_container))
        self.assertIs(self.read_nwbfile.objects[self.container.object_id], self.read_container)
        self.assertContainerEqual(self.read_container, self.container)
        self.validate()

    def validate(self):
        """Validate the written HDF5 file and raise if the validator reports any errors."""
        if os.path.exists(self.filename):
            with NWBHDF5IO(self.filename, mode='r') as io:
                errors = pynwb_validate(io)
                if errors:
                    # Report every validation error, not only the first one
                    raise Exception('\n'.join(str(err) for err in errors))

    def roundtripContainerZarrIO(self, cache_spec=False):
        """Round-trip the container through a Zarr file and return the read container."""
        return self._roundtrip(ZarrIO, cache_spec)

    def test_zarr_roundtrip(self):
        self.read_container = self.roundtripContainerZarrIO()
        # make sure we get a completely new object
        str(self.container)  # added as a test to make sure printing works
        self.assertNotEqual(id(self.container), id(self.read_container))
        self.assertContainerEqual(self.read_container, self.container)
        self.validate_zarr()

    def validate_zarr(self):
        """Check that the written Zarr file can be opened and read."""
        if os.path.exists(self.filename):
            with NWBZarrIO(self.filename, mode='r') as io:
                # TODO need to update the validator to support Zarr. For now we just read the file instead
                io.read()

    def addContainer(self, nwbfile):
        ''' Should take an NWBFile object and add the container to it '''
        raise unittest.SkipTest('Cannot run test unless addContainer is implemented')

    def getContainer(self, nwbfile):
        ''' Should take an NWBFile object and return the Container'''
        raise unittest.SkipTest('Cannot run test unless getContainer is implemented')


class TestDataInterfaceIO(TestMapRoundTrip):
    """Round-trip test for containers stored as acquisition data of the NWBFile."""

    def addContainer(self, nwbfile):
        ''' Should take an NWBFile object and add the container to it '''
        nwbfile.add_acquisition(self.container)

    def getContainer(self, nwbfile):
        ''' Should take an NWBFile object and return the Container'''
        return nwbfile.get_acquisition(self.container.name)