Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add array storage helpers #2065

Merged
merged 31 commits into v3 from add-array-storage-helpers
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
31 commits
Select commit. Hold shift + click to select a range.
ebbfbe0
implement store.list_prefix and store._set_dict
d-v-b Aug 3, 2024
da6083e
simplify string handling
d-v-b Aug 3, 2024
dc5fe47
add nchunks_initialized, and necessary additions for it
d-v-b Aug 2, 2024
b694b6e
rename _iter_chunks to _iter_chunk_coords
d-v-b Aug 2, 2024
6a27ca8
fix test name
d-v-b Aug 3, 2024
d15be9a
bring in correct store list_dir implementations
d-v-b Aug 3, 2024
ef34f25
Merge branch 'v3' of https://github.com/zarr-developers/zarr-python i…
d-v-b Aug 12, 2024
962ffed
bump numcodecs to dodge zstd exception
d-v-b Aug 12, 2024
5c98ab4
remove store._set_dict, and add _set_many and get_many instead
d-v-b Aug 12, 2024
9e64fa8
update deprecation warning template
d-v-b Aug 13, 2024
a4b4696
add a type annotation
d-v-b Aug 13, 2024
04b1d6a
refactor chunk iterators. they are not properties any more, just meth…
d-v-b Aug 13, 2024
12b3bc1
Merge branch 'v3' of https://github.com/zarr-developers/zarr-python i…
d-v-b Aug 13, 2024
3e2c656
Merge branch 'v3' of github.com:zarr-developers/zarr-python into add-…
d-v-b Sep 19, 2024
b7c1a56
_get_many returns tuple[str, buffer]
d-v-b Sep 19, 2024
44bed5c
stricter store types
d-v-b Sep 19, 2024
021d41e
Merge branch 'v3' of github.com:zarr-developers/zarr-python into add-…
d-v-b Sep 23, 2024
2db860b
fix types
d-v-b Sep 23, 2024
45f27b1
Merge branch 'v3' of github.com:zarr-developers/zarr-python into add-…
d-v-b Sep 23, 2024
78f22b9
Merge branch 'v3' of github.com:zarr-developers/zarr-python into add-…
d-v-b Sep 24, 2024
b5e08e8
lint
d-v-b Sep 24, 2024
43743e1
remove deprecation warnings
d-v-b Sep 24, 2024
f65a6e8
fix zip list_prefix
d-v-b Sep 24, 2024
df6f9a7
tests for nchunks_initialized, chunks_initialized; add selection_shap…
d-v-b Sep 24, 2024
e60cbe0
add nchunks test
d-v-b Sep 24, 2024
5c54449
fix docstrings
d-v-b Sep 24, 2024
e8598c6
fix docstring
d-v-b Sep 24, 2024
ae216e1
Merge branch 'v3' into add-array-storage-helpers
d-v-b Sep 24, 2024
768ab43
revert unnecessary changes to project config
d-v-b Sep 24, 2024
c953f21
Merge branch 'add-array-storage-helpers' of github.com:d-v-b/zarr-pyt…
d-v-b Sep 24, 2024
f0d61b2
Merge branch 'v3' of https://github.com/zarr-developers/zarr-python i…
d-v-b Sep 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -220,4 +220,5 @@ filterwarnings = [
"error:::zarr.*",
"ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning",
"ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning",
"ignore:.*is transitional and will be removed.*:DeprecationWarning",
]
10 changes: 9 additions & 1 deletion src/zarr/abc/store.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from collections.abc import AsyncGenerator, Mapping
from typing import Any, NamedTuple, Protocol, runtime_checkable

from typing_extensions import Self
Expand Down Expand Up @@ -221,6 +221,14 @@ def close(self) -> None:
self._is_open = False
pass

async def _set_dict(self, dict: Mapping[str, Buffer]) -> None:
    """
    Insert objects into storage as defined by a prefix: value mapping.

    Entries are written one at a time with sequential ``set`` calls;
    there is no batching or concurrency here.
    """
    for prefix, value in dict.items():
        await self.set(prefix, value)


@runtime_checkable
class ByteGetter(Protocol):
Expand Down
140 changes: 134 additions & 6 deletions src/zarr/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
# Questions to consider:
# 1. Was splitting the array into two classes really necessary?
from asyncio import gather
from collections.abc import Iterable
from collections.abc import Iterable, Iterator
from dataclasses import dataclass, field, replace
from typing import Any, Literal, cast

import numpy as np
import numpy.typing as npt
from typing_extensions import deprecated

from zarr.abc.codec import Codec, CodecPipeline
from zarr.abc.store import set_or_delete
Expand Down Expand Up @@ -52,11 +53,13 @@
OrthogonalSelection,
Selection,
VIndex,
ceildiv,
check_fields,
check_no_multi_fields,
is_pure_fancy_indexing,
is_pure_orthogonal_indexing,
is_scalar,
iter_grid,
pop_fields,
)
from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
Expand All @@ -65,7 +68,7 @@
from zarr.store.core import (
ensure_no_existing_node,
)
from zarr.sync import sync
from zarr.sync import collect_aiterator, sync


def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata:
Expand Down Expand Up @@ -393,10 +396,12 @@ def shape(self) -> ChunkCoords:
def chunks(self) -> ChunkCoords:
if isinstance(self.metadata.chunk_grid, RegularChunkGrid):
return self.metadata.chunk_grid.chunk_shape
else:
raise ValueError(
f"chunk attribute is only available for RegularChunkGrid, this array has a {self.metadata.chunk_grid}"
)

msg = (
f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`."
f"This array has a {self.metadata.chunk_grid} instead."
)
raise NotImplementedError(msg)

@property
def size(self) -> int:
Expand Down Expand Up @@ -437,6 +442,59 @@ def basename(self) -> str | None:
return self.name.split("/")[-1]
return None

@property
@deprecated(
    "cdata_shape is transitional and will be removed in an early zarr-python v3 release."
)
def cdata_shape(self) -> ChunkCoords:
    """
    The shape of the chunk grid for this array.

    Returns
    -------
    ChunkCoords
        One entry per dimension: the number of chunks along that dimension,
        computed as ``ceildiv(shape, chunk_shape)``.
    """
    # strict=False tolerates a length mismatch between shape and chunks;
    # NOTE(review): presumably both always have one entry per dimension — confirm.
    return tuple(ceildiv(s, c) for s, c in zip(self.shape, self.chunks, strict=False))

@property
@deprecated("nchunks is transitional and will be removed in an early zarr-python v3 release.")
def nchunks(self) -> int:
    """
    The number of chunks in the stored representation of this array.

    This is the product of the chunk-grid shape (``cdata_shape``), i.e. the
    number of addressable chunks — not the number of chunks actually written
    to storage (see ``nchunks_initialized`` for that).
    """
    return product(self.cdata_shape)

@property
def _iter_chunk_coords(self) -> Iterator[ChunkCoords]:
    """
    Produce an iterator over the coordinates of each chunk, in chunk grid space.

    Coordinates are yielded in lexicographic order over the grid defined by
    ``cdata_shape`` (see ``iter_grid``).
    """
    # NOTE(review): a property that returns a fresh iterator on every access
    # is surprising to callers; a plain method would make that clearer.
    return iter_grid(self.cdata_shape)

@property
def _iter_chunk_keys(self) -> Iterator[str]:
    """
    Iterate over the storage key of every chunk of this array, in the same
    order as ``_iter_chunk_coords``.
    """
    yield from map(self.metadata.encode_chunk_key, self._iter_chunk_coords)

@property
def _iter_chunk_regions(self) -> Iterator[tuple[slice, ...]]:
    """
    Iterate over the regions of the array spanned by each chunk: one tuple of
    per-dimension slices (step 1) per chunk grid coordinate.
    """
    for grid_coord in self._iter_chunk_coords:
        yield tuple(
            slice(idx * extent, idx * extent + extent, 1)
            for idx, extent in zip(grid_coord, self.chunks, strict=False)
        )

@property
def nbytes(self) -> int:
    """
    The number of bytes that can be stored in this array.

    Returns
    -------
    int
        Total element count of the array times the per-element size of its
        dtype.
    """
    # Bug fix: the previous ``self.nchunks * self.dtype.itemsize`` counted one
    # element per *chunk*; capacity is element count times item size.
    return self.size * self.dtype.itemsize

async def _get_selection(
self,
indexer: Indexer,
Expand Down Expand Up @@ -735,6 +793,52 @@ def read_only(self) -> bool:
def fill_value(self) -> Any:
return self.metadata.fill_value

@property
@deprecated(
    "cdata_shape is transitional and will be removed in an early zarr-python v3 release."
)
def cdata_shape(self) -> ChunkCoords:
    """
    The shape of the chunk grid for this array.

    Returns
    -------
    ChunkCoords
        One entry per dimension: the number of chunks along that dimension,
        computed as ``ceildiv(shape, chunk_shape)``.
    """
    # strict=False tolerates a length mismatch between shape and chunks;
    # NOTE(review): presumably both always have one entry per dimension — confirm.
    return tuple(ceildiv(s, c) for s, c in zip(self.shape, self.chunks, strict=False))

@property
@deprecated("nchunks is transitional and will be removed in an early zarr-python v3 release.")
def nchunks(self) -> int:
    """
    The number of chunks in the stored representation of this array.

    Delegates to the wrapped async array's ``nchunks``.
    """
    return self._async_array.nchunks

@property
def _iter_chunks(self) -> Iterator[ChunkCoords]:
    """
    Produce an iterator over the coordinates of each chunk, in chunk grid space.

    Delegates to the wrapped async array's ``_iter_chunk_coords``.
    """
    # NOTE(review): the async counterpart is named ``_iter_chunk_coords``;
    # consider aligning these names in a follow-up for consistency.
    yield from self._async_array._iter_chunk_coords

@property
def nbytes(self) -> int:
    """
    The number of bytes that can be stored in this array.

    Delegates to the wrapped async array.
    """
    backing = self._async_array
    return backing.nbytes

@property
def _iter_chunk_keys(self) -> Iterator[str]:
    """
    Iterate over the storage key of each chunk, delegating to the wrapped
    async array.
    """
    for key in self._async_array._iter_chunk_keys:
        yield key

@property
def _iter_chunk_regions(self) -> Iterator[tuple[slice, ...]]:
    """
    Iterate over the regions of the array spanned by each chunk, delegating
    to the wrapped async array.
    """
    for region in self._async_array._iter_chunk_regions:
        yield region

def __array__(
self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
) -> NDArrayLike:
Expand Down Expand Up @@ -2056,3 +2160,27 @@ def info(self) -> None:
return sync(
self._async_array.info(),
)


@deprecated(
    "nchunks_initialized is transitional and will be removed in an early zarr-python v3 release."
)
def nchunks_initialized(array: Array) -> int:
    """
    Count the chunks of ``array`` that have been written to storage.

    Returns
    -------
    int
        The length of ``chunks_initialized(array)``.
    """
    return len(chunks_initialized(array))


def chunks_initialized(array: Array) -> tuple[str, ...]:
    """
    Return the keys of all the chunks that exist in storage.

    Parameters
    ----------
    array : Array
        The array whose store is inspected.

    Returns
    -------
    tuple[str, ...]
        The chunk keys of ``array`` present in its store, in chunk grid
        iteration order.
    """
    # todo: make this compose with the underlying async iterator
    # Collect the store listing into a set so each membership test below is
    # O(1) instead of a linear scan. collect_aiterator already returns a
    # tuple, so no extra list() wrapper is needed.
    store_contents = set(
        collect_aiterator(array.store_path.store.list_prefix(prefix=array.store_path.path))
    )
    return tuple(key for key in array._iter_chunk_keys if key in store_contents)
28 changes: 26 additions & 2 deletions src/zarr/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
import math
import numbers
import operator
from collections.abc import Iterator, Sequence
from collections.abc import Iterable, Iterator, Sequence
from dataclasses import dataclass
from enum import Enum
from functools import reduce
from types import EllipsisType
from typing import (
TYPE_CHECKING,
Any,
NamedTuple,
Protocol,
TypeGuard,
Expand All @@ -27,6 +26,8 @@
from zarr.common import ChunkCoords, product

if TYPE_CHECKING:
from typing import Any

from zarr.array import Array
from zarr.chunk_grids import ChunkGrid

Expand Down Expand Up @@ -86,6 +87,29 @@ def ceildiv(a: float, b: float) -> int:
return math.ceil(a / b)


def iter_grid(shape: Iterable[int]) -> Iterator[ChunkCoords]:
    """
    Iterate over the elements of a grid.

    Given a grid shape as an iterable of ints, yield every coordinate tuple
    bounded by that shape, in lexicographic order.

    Parameters
    ----------
    shape: Iterable[int]
        The shape of the grid to iterate over.

    Examples
    --------
    >>> tuple(iter_grid((1,)))
    ((0,),)

    >>> tuple(iter_grid((2,3)))
    ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2))
    """
    axis_ranges = [range(extent) for extent in shape]
    yield from itertools.product(*axis_ranges)


def is_integer(x: Any) -> TypeGuard[int]:
"""True if x is an integer (both pure Python or NumPy)."""
return isinstance(x, numbers.Integral) and not is_bool(x)
Expand Down
10 changes: 10 additions & 0 deletions src/zarr/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ def get_chunk_spec(
def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
pass

@abstractmethod
def decode_chunk_key(self, key: str) -> ChunkCoords:
    """
    Inverse of ``encode_chunk_key``: map a chunk's storage key back to its
    coordinates in chunk grid space.
    """
    pass

@abstractmethod
def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
pass
Expand Down Expand Up @@ -252,6 +256,9 @@ def get_chunk_spec(
def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
return self.chunk_key_encoding.encode_chunk_key(chunk_coords)

def decode_chunk_key(self, key: str) -> ChunkCoords:
    """
    Decode a chunk storage key into chunk grid coordinates, delegating to
    this metadata's ``chunk_key_encoding``.
    """
    return self.chunk_key_encoding.decode_chunk_key(key)

def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
def _json_convert(o: Any) -> Any:
if isinstance(o, np.dtype):
Expand Down Expand Up @@ -445,6 +452,9 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
return "0" if chunk_identifier == "" else chunk_identifier

def decode_chunk_key(self, key: str) -> ChunkCoords:
    """
    Decode a v2 chunk key into integer chunk grid coordinates by splitting
    the key on ``dimension_separator``.

    NOTE(review): ``encode_chunk_key`` maps the empty coordinate tuple to
    "0", but decoding "0" yields ``(0,)`` — the round trip is not exact for
    zero-dimensional arrays; confirm whether that case can reach here.
    """
    return tuple(map(int, key.split(self.dimension_separator)))

def update_shape(self, shape: ChunkCoords) -> Self:
return replace(self, shape=shape)

Expand Down
6 changes: 1 addition & 5 deletions src/zarr/store/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,14 +193,10 @@ async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
-------
AsyncGenerator[str, None]
"""
for p in (self.root / prefix).rglob("*"):
if p.is_file():
yield str(p)

to_strip = str(self.root) + "/"
for p in (self.root / prefix).rglob("*"):
if p.is_file():
yield str(p).replace(to_strip, "")
yield str(p).removeprefix(to_strip)

async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
"""
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/store/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ async def list(self) -> AsyncGenerator[str, None]:
async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
    """
    Yield the suffix, relative to ``prefix``, of every key in the store that
    starts with ``prefix``.

    Parameters
    ----------
    prefix : str
        The key prefix to filter on.
    """
    # Each matching key is yielded exactly once, with the prefix stripped —
    # consistent with the local and remote store implementations.
    for key in self._store_dict:
        if key.startswith(prefix):
            yield key.removeprefix(prefix)

async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
if prefix.endswith("/"):
Expand Down
5 changes: 3 additions & 2 deletions src/zarr/store/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,5 +205,6 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
yield onefile

async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
    """
    Recursively list all keys under ``prefix``, yielding each key relative to
    ``self.path`` joined with ``prefix``.

    Parameters
    ----------
    prefix : str
        The key prefix to list under, relative to this store's root path.
    """
    # _find recurses through the remote filesystem; strip the absolute
    # root-plus-prefix so yielded keys are store-relative, matching the
    # other store implementations.
    find_str = "/".join([self.path, prefix])
    for onefile in await self._fs._find(find_str):
        yield onefile.removeprefix(find_str)
17 changes: 17 additions & 0 deletions src/zarr/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,23 @@ def _get_loop() -> asyncio.AbstractEventLoop:
return loop[0]


async def _collect_aiterator(data: AsyncIterator[T]) -> tuple[T, ...]:
    """
    Drain an async iterator, returning every yielded item as a tuple.
    """
    return tuple([item async for item in data])


def collect_aiterator(data: AsyncIterator[T]) -> tuple[T, ...]:
    """
    Synchronously collect an entire async iterator into a tuple.

    Blocks the calling thread via ``sync`` until the iterator is exhausted.
    """
    return sync(_collect_aiterator(data))


class SyncMixin:
def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T:
# TODO: refactor this to to take *args and **kwargs and pass those to the method
Expand Down
Loading