[WIP] Add/zarrio #1018

Closed
wants to merge 12 commits
291 changes: 291 additions & 0 deletions docs/notebooks/zarr_file_conversion_test.ipynb
@@ -0,0 +1,291 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b03b7582",
"metadata": {},
"source": [
"# Installing the Zarr Backend for NWB"
]
},
{
"cell_type": "markdown",
"id": "6bd6b4c2",
"metadata": {},
"source": [
"```\n",
"conda create -n nwbzarr python=3.9\n",
"conda activate nwbzarr\n",
"conda install Cython\n",
"conda install numpy==1.21.0\n",
"conda install pkgconfig\n",
"conda install h5py=3.3.0\n",
"conda install pandas==1.3.0\n",
"conda install python-dateutil==2.8.1\n",
"git clone --recurse-submodules https://github.com/NeurodataWithoutBorders/pynwb.git\n",
"cd pynwb\n",
"git checkout add/zarrio\n",
"pip install -e .\n",
"pip uninstall hdmf\n",
"git clone --recurse-submodules https://github.com/hdmf-dev/hdmf.git\n",
"cd hdmf\n",
"git checkout 1.0.3-zarr\n",
"conda install --file requirements.txt\n",
"pip install -e .\n",
"\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "96a1c56c",
"metadata": {},
"source": [
"To use this notebook, the following optional package should also be installed\n",
"\n",
"```\n",
"pip install dandi\n",
"conda install jupyter\n",
"```"
]
},
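{
"cell_type": "markdown",
"id": "3f2a8b1c",
"metadata": {},
"source": [
"Before proceeding, we can quickly verify the installation (a minimal check; the import fails if the experimental Zarr backend is unavailable):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e4d7c2a",
"metadata": {},
"outputs": [],
"source": [
"# Quick check that the experimental Zarr backend was installed correctly\n",
"from pynwb import NWBZarrIO"
]
},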
{
"cell_type": "code",
"execution_count": 1,
"id": "41b2187e",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil"
]
},
{
"cell_type": "markdown",
"id": "d5841dfd",
"metadata": {},
"source": [
"# Download a file from DANDI"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ac7df3b3",
"metadata": {},
"outputs": [],
"source": [
"from dandi.dandiapi import DandiAPIClient\n",
"\n",
"dandiset_id = '000207' \n",
"filepath = \"sub-1/sub-1_ses-1_ecephys+image.nwb\" # 5 MB file\n",
"with DandiAPIClient() as client:\n",
" asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath)\n",
" s3_path = asset.get_content_url(follow_redirects=1, strip_query=True)\n",
" filename = os.path.basename(asset.path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7d2b53df",
"metadata": {},
"outputs": [],
"source": [
"asset.download(filename)"
]
},
{
"cell_type": "markdown",
"id": "7b37360b",
"metadata": {},
"source": [
"# Define output settings and clean up old files "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "854dc0ae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Removing test_hdf5_sub-1_ses-1_ecephys+image.nwb\n",
"Removing test_zarr_sub-1_ses-1_ecephys+image.nwb\n"
]
}
],
"source": [
"zarr_filename = \"test_zarr_\" + filename\n",
"hdf_filename = \"test_hdf5_\" + filename\n",
"\n",
"# Delete our converted HDF5 file from previous runs of this notebook\n",
"if os.path.exists(hdf_filename):\n",
" print(\"Removing %s\" % hdf_filename)\n",
" os.remove(hdf_filename)\n",
"# Delete our converted Zarr file from previous runs of this notebook\n",
"if os.path.exists(zarr_filename):\n",
" print(\"Removing %s\" % zarr_filename)\n",
" shutil.rmtree(zarr_filename)"
]
},
{
"cell_type": "markdown",
"id": "240691c5",
"metadata": {},
"source": [
"# Convert the file to Zarr"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "981df005",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/oruebel/Devel/nwb/zarr/hdmf/src/hdmf/backends/zarr/zarr_tools.py:78: UserWarning: \u001b[91mThe ZarrIO backend is experimental. It is under active development. The ZarrIO backend may change or be removed at any time and backward compatibility is not guaranteed.\u001b[0m\n",
" warnings.warn(warn_msg)\n"
]
}
],
"source": [
"from pynwb import NWBHDF5IO, NWBZarrIO\n",
"\n",
"with NWBHDF5IO(filename , 'r', load_namespaces=False) as read_io:\n",
" with NWBZarrIO(zarr_filename, mode='w', chunking=True) as export_io:\n",
" export_io.export(src_io=read_io, write_args=dict(link_data=False))"
]
},
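{
"cell_type": "markdown",
"id": "f3a9c1d0",
"metadata": {},
"source": [
"Since the converted file is a Zarr directory store on disk, we can optionally peek at its hierarchy directly with the `zarr` package (a quick sanity check, not required for the conversion itself):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4b8d2c1",
"metadata": {},
"outputs": [],
"source": [
"import zarr\n",
"\n",
"# Open the converted store read-only and print its group/dataset hierarchy\n",
"zroot = zarr.open(zarr_filename, mode='r')\n",
"print(zroot.tree())"
]
},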
{
"cell_type": "markdown",
"id": "cd15edcc",
"metadata": {},
"source": [
"# Read the Zarr file back in"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2d8aa004",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/oruebel/Devel/nwb/zarr/pynwb/src/pynwb/base.py:167: UserWarning: Length of data does not match length of timestamps. Your data may be transposed. Time should be on the 0th dimension\n",
" warn(\"Length of data does not match length of timestamps. Your data may be transposed. Time should be on \"\n"
]
}
],
"source": [
"zr = NWBZarrIO(zarr_filename, 'r')\n",
"zf = zr.read()"
]
},
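{
"cell_type": "markdown",
"id": "c2d4e6f8",
"metadata": {},
"source": [
"As a quick check we can inspect a few top-level attributes of the in-memory `NWBFile` (which groups are populated depends on the downloaded file) and close the reader once we are done:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5e7f9a1",
"metadata": {},
"outputs": [],
"source": [
"# Inspect a few top-level attributes of the NWBFile read from Zarr\n",
"print(zf.identifier)\n",
"print(list(zf.acquisition))\n",
"\n",
"# Close the reader now that we are done with the file\n",
"zr.close()"
]
},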
{
"cell_type": "markdown",
"id": "b5b97c8f",
"metadata": {},
"source": [
"# Convert the Zarr file back to HDF5"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c74a470e",
"metadata": {},
"outputs": [],
"source": [
"with NWBZarrIO(zarr_filename, mode='r') as read_io:\n",
" with NWBHDF5IO(hdf_filename , 'w') as export_io:\n",
" export_io.export(src_io=read_io, write_args=dict(link_data=False))"
]
},
{
"cell_type": "markdown",
"id": "f1cc2427",
"metadata": {},
"source": [
"# Read the new HDF5 file back\n",
"\n",
"Now our file has been converted from HDF5 to Zarr and back again to HDF5. Here we check that we can stil read that file"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "51f008f2",
"metadata": {},
"outputs": [],
"source": [
"with NWBHDF5IO(hdf_filename , 'r') as hr:\n",
" hf = hr.read()"
]
},
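{
"cell_type": "markdown",
"id": "b1c3d5e7",
"metadata": {},
"source": [
"We can also list the top-level contents of the round-tripped file (a minimal sketch; the exact containers depend on the file):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0b2c4d6",
"metadata": {},
"outputs": [],
"source": [
"# Re-open the round-tripped file and list its top-level containers\n",
"with NWBHDF5IO(hdf_filename, 'r') as io:\n",
"    nwbfile = io.read()\n",
"    print(nwbfile.identifier)\n",
"    print(\"acquisition:\", list(nwbfile.acquisition))\n",
"    print(\"processing: \", list(nwbfile.processing))"
]
},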
{
"cell_type": "markdown",
"id": "a6a56195",
"metadata": {},
"source": [
"# Notes\n",
"\n",
"The ZarrIO backend for NWB is under development as part of the following PRs on GitHub:\n",
"\n",
"* **HDMF**: https://github.com/hdmf-dev/hdmf/pull/696\n",
"\n",
" * Related PR: https://github.com/hdmf-dev/hdmf/pull/697 This PR includes all of the general changes to HDMF that we did to implement the Zarr backend. Once #697 is merged #696 should be agains synced with dev, so that the PR then only includes the changes to add Zarr itself.\n",
" \n",
"* **PyNWB**: https://github.com/NeurodataWithoutBorders/pynwb/pull/1018"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
70 changes: 68 additions & 2 deletions src/pynwb/__init__.py
@@ -222,8 +222,7 @@ def __init__(self, **kwargs):
warn("loading namespaces from file - ignoring 'manager'")
if extensions is not None:
warn("loading namespaces from file - ignoring 'extensions' argument")
# namespaces are not loaded when creating an NWBHDF5IO object in write mode
if 'w' in mode or mode == 'x':
if 'w' in mode or mode == 'x': # namespaces are not loaded in write mode
raise ValueError("cannot load namespaces from file when writing to it")

tm = get_type_map()
@@ -257,6 +256,73 @@ def export(self, **kwargs):
        call_docval_func(super().export, kwargs)


try:
    from hdmf.backends.zarr.zarr_tools import ZarrIO as _ZarrIO
    import zarr

    class NWBZarrIO(_ZarrIO):

        @docval({'name': 'path', 'type': str, 'doc': 'the path to the Zarr file'},
                {'name': 'mode', 'type': str,
                 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-")'},
                {'name': 'load_namespaces', 'type': bool,
                 'doc': 'whether or not to load cached namespaces from given path - not applicable in write mode',
                 'default': False},
                {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O',
                 'default': None},
                {'name': 'extensions', 'type': (str, TypeMap, list),
                 'doc': 'a path to a namespace, a TypeMap, or a list consisting of paths to namespaces and TypeMaps',
                 'default': None},
                {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool),
                 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',
                 'default': None},
                {'name': 'chunking', 'type': bool, 'doc': "Enable/Disable chunking of datasets by default",
                 'default': True})
        def __init__(self, **kwargs):
            path, mode, manager, extensions, load_namespaces, synchronizer, chunking = \
                popargs('path', 'mode', 'manager', 'extensions',
                        'load_namespaces', 'synchronizer', 'chunking', kwargs)
            if load_namespaces:
                if manager is not None:
                    warn("loading namespaces from file - ignoring 'manager'")
                if extensions is not None:
                    warn("loading namespaces from file - ignoring 'extensions' argument")
                # namespaces are not loaded when creating an NWBZarrIO object in write mode
                if 'w' in mode or mode == 'x':
                    raise ValueError("cannot load namespaces from file when writing to it")

                tm = get_type_map()
                super(NWBZarrIO, self).load_namespaces(tm, path)
                manager = BuildManager(tm)
            else:
                if manager is not None and extensions is not None:
                    raise ValueError("'manager' and 'extensions' cannot be specified together")
                elif extensions is not None:
                    manager = get_manager(extensions=extensions)
                elif manager is None:
                    manager = get_manager()
            super(NWBZarrIO, self).__init__(path,
                                            manager=manager,
                                            mode=mode,
                                            synchronizer=synchronizer,
                                            chunking=chunking)

        @docval({'name': 'src_io', 'type': HDMFIO, 'doc': 'the HDMFIO object for reading the data to export'},
                {'name': 'nwbfile', 'type': 'NWBFile',
                 'doc': 'the NWBFile object to export. If None, then the entire contents of src_io will be exported',
                 'default': None},
                {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`',
                 'default': dict()})
        def export(self, **kwargs):
            nwbfile = popargs('nwbfile', kwargs)
            kwargs['container'] = nwbfile
            call_docval_func(super().export, kwargs)

except ImportError:
    from warnings import warn
    warn("Ignoring optional NWBZarrIO. Zarr is not installed")


from . import io as __io # noqa: F401,E402
from .core import NWBContainer, NWBData # noqa: F401,E402
from .base import TimeSeries, ProcessingModule # noqa: F401,E402
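For reference, a hypothetical usage sketch of the new `NWBZarrIO` class, based on the constructor docval above. The backend is explicitly experimental, so names and signatures may change; `demo.nwb.zarr` is just an illustrative path.

```python
from datetime import datetime

from pynwb import NWBFile, NWBZarrIO

# Hypothetical sketch based on the docval above; the ZarrIO backend is
# experimental and this API may change.
nwbfile = NWBFile(session_description='demo session',
                  identifier='demo-001',
                  session_start_time=datetime.now().astimezone())

# chunking=True (the default) enables chunked datasets; synchronizer=True
# requests a zarr.ProcessSynchronizer for parallel I/O, per the docval.
with NWBZarrIO('demo.nwb.zarr', mode='w', chunking=True, synchronizer=True) as io:
    io.write(nwbfile)

# Read the file back in
with NWBZarrIO('demo.nwb.zarr', mode='r') as io:
    nwbfile_in = io.read()
    print(nwbfile_in.identifier)
```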