From e642337c38a14c4430d8ca1dd912830358a4d9b2 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 30 Apr 2024 09:53:11 +0200 Subject: [PATCH 01/45] use Buffer --- src/zarr/v3/__init__.py | 1 + src/zarr/v3/abc/codec.py | 15 ++++---- src/zarr/v3/abc/store.py | 13 ++++--- src/zarr/v3/array.py | 11 +++--- src/zarr/v3/array_v2.py | 19 +++++---- src/zarr/v3/buffer.py | 70 ++++++++++++++++++++++++++++++++++ src/zarr/v3/codecs/blosc.py | 18 ++++----- src/zarr/v3/codecs/bytes.py | 13 ++++--- src/zarr/v3/codecs/crc32c_.py | 21 +++++----- src/zarr/v3/codecs/gzip.py | 15 ++++---- src/zarr/v3/codecs/pipeline.py | 8 ++-- src/zarr/v3/codecs/sharding.py | 55 +++++++++++++------------- src/zarr/v3/codecs/zstd.py | 15 ++++---- src/zarr/v3/group.py | 33 ++++++++++------ src/zarr/v3/metadata.py | 5 ++- src/zarr/v3/store/core.py | 8 ++-- src/zarr/v3/store/local.py | 23 +++++------ src/zarr/v3/store/memory.py | 18 ++++----- src/zarr/v3/store/remote.py | 6 +-- tests/v3/test_codecs.py | 33 ++++++++-------- 20 files changed, 249 insertions(+), 151 deletions(-) create mode 100644 src/zarr/v3/buffer.py diff --git a/src/zarr/v3/__init__.py b/src/zarr/v3/__init__.py index 3441fa67be..c046cc01f0 100644 --- a/src/zarr/v3/__init__.py +++ b/src/zarr/v3/__init__.py @@ -2,6 +2,7 @@ from typing import Union + import zarr.v3.codecs # noqa: F401 from zarr.v3.array import Array, AsyncArray # noqa: F401 from zarr.v3.array_v2 import ArrayV2 diff --git a/src/zarr/v3/abc/codec.py b/src/zarr/v3/abc/codec.py index d0e51ff894..796f321465 100644 --- a/src/zarr/v3/abc/codec.py +++ b/src/zarr/v3/abc/codec.py @@ -7,12 +7,13 @@ from zarr.v3.abc.metadata import Metadata from zarr.v3.common import ArraySpec +from zarr.v3.buffer import Buffer from zarr.v3.store import StorePath if TYPE_CHECKING: from typing_extensions import Self - from zarr.v3.common import BytesLike, SliceSelection + from zarr.v3.common import SliceSelection from zarr.v3.metadata import ArrayMetadata from zarr.v3.config import RuntimeConfiguration @@ -58,7 +59,7 @@ class ArrayBytesCodec(Codec): @abstractmethod async def decode( self, - chunk_array: BytesLike, + chunk_array: Buffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: @@ -70,7 +71,7 @@ async def encode( chunk_array: np.ndarray, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: pass @@ -103,17 +104,17 @@ class BytesBytesCodec(Codec): @abstractmethod async def decode( self, - chunk_array: BytesLike, + chunk_array: Buffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> BytesLike: + ) -> Buffer: pass @abstractmethod async def encode( self, - chunk_array: BytesLike, + chunk_array: Buffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: pass diff --git a/src/zarr/v3/abc/store.py b/src/zarr/v3/abc/store.py index ce5de279c4..7f9698f88b 100644 --- a/src/zarr/v3/abc/store.py +++ b/src/zarr/v3/abc/store.py @@ -2,12 +2,14 @@ from typing import List, Tuple, Optional +from zarr.v3.buffer import Buffer + class Store(ABC): @abstractmethod async def get( self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[bytes]: + ) -> Optional[Buffer]: """Retrieve the value associated with a given key. Parameters @@ -17,14 +19,14 @@ async def get( Returns ------- - bytes + Buffer """ ... 
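+        # Usage sketch (illustrative only, not part of the interface):
+        #   buf = await store.get("zarr.json")   # -> Optional[Buffer]
+        #   raw = buf.as_bytearray() if buf is not None else None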
@abstractmethod async def get_partial_values( self, key_ranges: List[Tuple[str, Tuple[int, int]]] - ) -> List[bytes]: + ) -> List[Buffer]: """Retrieve possibly partial values from given key_ranges. Parameters @@ -34,8 +36,7 @@ async def get_partial_values( Returns ------- - list[bytes] - list of values, in the order of the key_ranges, may contain null/none for missing keys + list of values, in the order of the key_ranges, may contain null/none for missing keys """ ... @@ -60,7 +61,7 @@ def supports_writes(self) -> bool: ... @abstractmethod - async def set(self, key: str, value: bytes) -> None: + async def set(self, key: str, value: Buffer) -> None: """Store a (key, value) pair. Parameters diff --git a/src/zarr/v3/array.py b/src/zarr/v3/array.py index c0a00a624e..54f0d03a95 100644 --- a/src/zarr/v3/array.py +++ b/src/zarr/v3/array.py @@ -35,6 +35,7 @@ from zarr.v3.chunk_grids import RegularChunkGrid from zarr.v3.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.v3.metadata import ArrayMetadata +from zarr.v3.buffer import as_buffer from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync @@ -150,7 +151,7 @@ async def open( assert zarr_json_bytes is not None return cls.from_dict( store_path, - json.loads(zarr_json_bytes), + json.loads(zarr_json_bytes.as_bytearray()), runtime_configuration=runtime_configuration, ) @@ -165,7 +166,7 @@ async def open_auto( if v3_metadata_bytes is not None: return cls.from_dict( store_path, - json.loads(v3_metadata_bytes), + json.loads(v3_metadata_bytes.as_bytearray()), runtime_configuration=runtime_configuration or RuntimeConfiguration(), ) else: @@ -223,7 +224,7 @@ async def getitem(self, selection: Selection): return out[()] async def _save_metadata(self) -> None: - await (self.store_path / ZARR_JSON).set(self.metadata.to_bytes()) + await (self.store_path / ZARR_JSON).set(as_buffer(self.metadata.to_bytes())) async def _read_chunk( self, @@ -392,14 +393,14 @@ async def _delete_key(key: str) -> None: ) # Write new metadata - await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes()) + await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata)) return replace(self, metadata=new_metadata) async def update_attributes(self, new_attributes: Dict[str, Any]) -> AsyncArray: new_metadata = replace(self.metadata, attributes=new_attributes) # Write new metadata - await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes()) + await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata)) return replace(self, metadata=new_metadata) def __repr__(self): diff --git a/src/zarr/v3/array_v2.py b/src/zarr/v3/array_v2.py index f150d2dbd2..4f6cbece8c 100644 --- a/src/zarr/v3/array_v2.py +++ b/src/zarr/v3/array_v2.py @@ -23,6 +23,7 @@ from zarr.v3.config import RuntimeConfiguration from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice from zarr.v3.metadata import ArrayV2Metadata +from zarr.v3.buffer import as_buffer, as_bytearray from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync @@ -152,8 +153,10 @@ async def open_async( assert zarray_bytes is not None return cls.from_dict( store_path, - zarray_json=json.loads(zarray_bytes), - zattrs_json=json.loads(zattrs_bytes) if zattrs_bytes is not None else None, + zarray_json=json.loads(zarray_bytes.as_bytearray()), + zattrs_json=json.loads(zattrs_bytes.as_bytearray()) + if zattrs_bytes is not None + else None, runtime_configuration=runtime_configuration, ) @@ -192,7 +195,7 @@ async def 
_save_metadata(self) -> None: await (self.store_path / ZARRAY_JSON).set(self.metadata.to_bytes()) if self.attributes is not None and len(self.attributes) > 0: await (self.store_path / ZATTRS_JSON).set( - json.dumps(self.attributes).encode(), + as_buffer(json.dumps(self.attributes).encode()), ) else: await (self.store_path / ZATTRS_JSON).delete() @@ -258,7 +261,7 @@ async def _read_chunk( ): store_path = self.store_path / self._encode_chunk_key(chunk_coords) - chunk_array = await self._decode_chunk(await store_path.get()) + chunk_array = await self._decode_chunk(as_bytearray(await store_path.get())) if chunk_array is not None: tmp = chunk_array[chunk_selection] out[out_selection] = tmp @@ -359,7 +362,7 @@ async def _write_chunk( else: # writing partial chunks # read chunk first - tmp = await self._decode_chunk(await store_path.get()) + tmp = await self._decode_chunk(as_bytearray(await store_path.get())) # merge new value if tmp is None: @@ -387,7 +390,7 @@ async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.nda if chunk_bytes is None: await store_path.delete() else: - await store_path.set(chunk_bytes) + await store_path.set(as_buffer(chunk_bytes)) async def _encode_chunk(self, chunk_array: np.ndarray) -> Optional[BytesLike]: chunk_array = chunk_array.ravel(order=self.metadata.order) @@ -506,7 +509,7 @@ async def convert_to_v3_async(self) -> Array: ) new_metadata_bytes = new_metadata.to_bytes() - await (self.store_path / ZARR_JSON).set(new_metadata_bytes) + await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata_bytes)) return Array.from_dict( store_path=self.store_path, @@ -515,7 +518,7 @@ async def convert_to_v3_async(self) -> Array: ) async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> ArrayV2: - await (self.store_path / ZATTRS_JSON).set(json.dumps(new_attributes).encode()) + await (self.store_path / ZATTRS_JSON).set(as_buffer(json.dumps(new_attributes).encode())) return replace(self, attributes=new_attributes) def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2: diff --git a/src/zarr/v3/buffer.py b/src/zarr/v3/buffer.py new file mode 100644 index 0000000000..669e9059c7 --- /dev/null +++ b/src/zarr/v3/buffer.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Optional +import numpy as np + + +if TYPE_CHECKING: + from typing_extensions import Self + + +class NDBuffer: + # TODO: replace np.ndarray with this n-dimensional buffer + pass + + +class Buffer(NDBuffer): + """Contiguous memory block + + We use `Buffer` throughout Zarr to represent a contiguous block of memory. + For now, we only support host memory but the plan is to support other types + of memory such as CUDA device memory. 
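+
+    A usage sketch (illustrative only, not a stable API):
+
+        buf = as_buffer(b"\x00\x01")
+        buf.as_bytearray()           # -> bytearray(b'\x00\x01')
+        buf.as_numpy_array("uint8")  # -> array([0, 1], dtype=uint8)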
+ """ + + def __init__(self, data: bytearray): + assert isinstance(data, bytearray) + self._data = data + + def as_bytearray(self) -> bytearray: + return self._data + + def as_numpy_array(self, dtype: np.DTypeLike) -> np.ndarray: + return np.frombuffer(self._data, dtype=dtype) + + def __getitem__(self, key) -> Self: + return self.__class__(self.as_bytearray().__getitem__(key)) + + def __setitem__(self, key, value) -> None: + self.as_bytearray().__setitem__(key, value) + + def __len__(self) -> int: + return len(self.as_bytearray()) + + def __add__(self, other: Buffer) -> Self: + return self.__class__(self.as_bytearray() + other.as_bytearray()) + + +def as_buffer(data: Any) -> Buffer: + if isinstance(data, Buffer): + return data + if isinstance(data, bytearray): + return Buffer(data) + if isinstance(data, bytes): + return Buffer(bytearray(data)) + if hasattr(data, "to_bytes"): + return as_buffer(data.to_bytes()) + return Buffer(bytearray(np.asarray(data))) + + +def as_bytes_wrapper(func, buf: Buffer) -> Buffer: + return as_buffer(func(buf.as_bytearray())) + + +def return_as_bytes_wrapper(func, *arg, **kwargs) -> Buffer: + return as_buffer(func(*arg, **kwargs)) + + +def as_bytearray(data: Optional[Buffer]): + if data is None: + return data + return data.as_bytearray() diff --git a/src/zarr/v3/codecs/blosc.py b/src/zarr/v3/codecs/blosc.py index 479865241f..f59e08ac17 100644 --- a/src/zarr/v3/codecs/blosc.py +++ b/src/zarr/v3/codecs/blosc.py @@ -6,17 +6,17 @@ from typing import TYPE_CHECKING, Union import numcodecs -import numpy as np from numcodecs.blosc import Blosc from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_enum, parse_named_configuration, to_thread +from zarr.v3.buffer import Buffer, as_bytes_wrapper, return_as_bytes_wrapper if TYPE_CHECKING: from typing import Dict, Optional from typing_extensions import Self - from zarr.v3.common import JSON, ArraySpec, BytesLike + from zarr.v3.common import JSON, ArraySpec from zarr.v3.config import RuntimeConfiguration @@ -161,20 +161,20 @@ def _blosc_codec(self) -> Blosc: async def decode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> BytesLike: - return await to_thread(self._blosc_codec.decode, chunk_bytes) + ) -> Buffer: + return await to_thread(as_bytes_wrapper, self._blosc_codec.decode, chunk_bytes) async def encode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - chunk_array = np.frombuffer(chunk_bytes, dtype=chunk_spec.dtype) - return await to_thread(self._blosc_codec.encode, chunk_array) + ) -> Optional[Buffer]: + chunk_array = chunk_bytes.as_numpy_array(chunk_spec.dtype) + return await to_thread(return_as_bytes_wrapper, self._blosc_codec.encode, chunk_array) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError diff --git a/src/zarr/v3/codecs/bytes.py b/src/zarr/v3/codecs/bytes.py index f92fe5606d..6ae9e7ddda 100644 --- a/src/zarr/v3/codecs/bytes.py +++ b/src/zarr/v3/codecs/bytes.py @@ -10,9 +10,10 @@ from zarr.v3.abc.codec import ArrayBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_enum, parse_named_configuration +from zarr.v3.buffer import Buffer, as_buffer if TYPE_CHECKING: - from zarr.v3.common import JSON, ArraySpec, BytesLike + from zarr.v3.common import JSON, 
ArraySpec from zarr.v3.config import RuntimeConfiguration from typing_extensions import Self @@ -70,10 +71,11 @@ def _get_byteorder(self, array: np.ndarray) -> Endian: async def decode( self, - chunk_bytes: BytesLike, + chunk_bytes: Buffer, chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: + assert isinstance(chunk_bytes, Buffer) if chunk_spec.dtype.itemsize > 0: if self.endian == Endian.little: prefix = "<" @@ -82,8 +84,7 @@ async def decode( dtype = np.dtype(f"{prefix}{chunk_spec.dtype.str[1:]}") else: dtype = np.dtype(f"|{chunk_spec.dtype.str[1:]}") - print(dtype) - chunk_array = np.frombuffer(chunk_bytes, dtype) + chunk_array = chunk_bytes.as_numpy_array(dtype) # ensure correct chunk shape if chunk_array.shape != chunk_spec.shape: @@ -97,13 +98,13 @@ async def encode( chunk_array: np.ndarray, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: if chunk_array.dtype.itemsize > 1: byteorder = self._get_byteorder(chunk_array) if self.endian is not None and self.endian != byteorder: new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) chunk_array = chunk_array.astype(new_dtype) - return chunk_array.tobytes() + return as_buffer(chunk_array) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length diff --git a/src/zarr/v3/codecs/crc32c_.py b/src/zarr/v3/codecs/crc32c_.py index 555bdeae3b..5677018f5e 100644 --- a/src/zarr/v3/codecs/crc32c_.py +++ b/src/zarr/v3/codecs/crc32c_.py @@ -10,11 +10,12 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_named_configuration +from zarr.v3.buffer import Buffer if TYPE_CHECKING: from typing import Dict, Optional from typing_extensions import Self - from zarr.v3.common import JSON, BytesLike, ArraySpec + from zarr.v3.common import JSON, ArraySpec from zarr.v3.config import RuntimeConfiguration @@ -32,12 +33,13 @@ def to_dict(self) -> Dict[str, JSON]: async def decode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> BytesLike: - crc32_bytes = chunk_bytes[-4:] - inner_bytes = chunk_bytes[:-4] + ) -> Buffer: + data = chunk_bytes.as_bytearray() + crc32_bytes = data[-4:] + inner_bytes = data[:-4] computed_checksum = np.uint32(crc32c(inner_bytes)).tobytes() stored_checksum = bytes(crc32_bytes) @@ -46,15 +48,16 @@ async def decode( "Stored and computed checksum do not match. " + f"Stored: {stored_checksum!r}. Computed: {computed_checksum!r}." 
) - return inner_bytes + return Buffer(inner_bytes) async def encode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - return chunk_bytes + np.uint32(crc32c(chunk_bytes)).tobytes() + ) -> Optional[Buffer]: + bytes = chunk_bytes.as_bytearray() + return Buffer(bytes + np.uint32(crc32c(bytes)).tobytes()) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 diff --git a/src/zarr/v3/codecs/gzip.py b/src/zarr/v3/codecs/gzip.py index 478eee90c1..cc1b30b86c 100644 --- a/src/zarr/v3/codecs/gzip.py +++ b/src/zarr/v3/codecs/gzip.py @@ -7,11 +7,12 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_named_configuration, to_thread +from zarr.v3.buffer import Buffer, as_bytes_wrapper if TYPE_CHECKING: from typing import Optional, Dict from typing_extensions import Self - from zarr.v3.common import JSON, ArraySpec, BytesLike + from zarr.v3.common import JSON, ArraySpec from zarr.v3.config import RuntimeConfiguration @@ -46,19 +47,19 @@ def to_dict(self) -> Dict[str, JSON]: async def decode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> BytesLike: - return await to_thread(GZip(self.level).decode, chunk_bytes) + ) -> Buffer: + return await to_thread(as_bytes_wrapper, GZip(self.level).decode, chunk_bytes) async def encode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - return await to_thread(GZip(self.level).encode, chunk_bytes) + ) -> Optional[Buffer]: + return await to_thread(as_bytes_wrapper, GZip(self.level).encode, chunk_bytes) def compute_encoded_size( self, diff --git a/src/zarr/v3/codecs/pipeline.py b/src/zarr/v3/codecs/pipeline.py index 7bb872eb79..91ba1926cc 100644 --- a/src/zarr/v3/codecs/pipeline.py +++ b/src/zarr/v3/codecs/pipeline.py @@ -16,13 +16,14 @@ from zarr.v3.abc.metadata import Metadata from zarr.v3.codecs.registry import get_codec_class from zarr.v3.common import parse_named_configuration +from zarr.v3.buffer import Buffer if TYPE_CHECKING: from typing import Iterator, List, Optional, Tuple, Union from zarr.v3.store import StorePath from zarr.v3.metadata import ArrayMetadata from zarr.v3.config import RuntimeConfiguration - from zarr.v3.common import JSON, ArraySpec, BytesLike, SliceSelection + from zarr.v3.common import JSON, ArraySpec, SliceSelection @dataclass(frozen=True) @@ -149,7 +150,7 @@ def _codecs_with_resolved_metadata( async def decode( self, - chunk_bytes: BytesLike, + chunk_bytes: Buffer, array_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: @@ -188,7 +189,7 @@ async def encode( chunk_array: np.ndarray, array_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: ( aa_codecs_with_spec, ab_codec_with_spec, @@ -217,6 +218,7 @@ async def encode( return None chunk_bytes = chunk_bytes_maybe + assert isinstance(chunk_bytes, Buffer) return chunk_bytes async def encode_partial( diff --git a/src/zarr/v3/codecs/sharding.py b/src/zarr/v3/codecs/sharding.py index 0385154c0f..25e440dbc8 100644 --- a/src/zarr/v3/codecs/sharding.py +++ b/src/zarr/v3/codecs/sharding.py @@ -37,6 +37,7 @@ runtime_configuration as make_runtime_configuration, parse_codecs, ) +from zarr.v3.buffer 
import Buffer, as_buffer if TYPE_CHECKING: from typing import Awaitable, Callable, Dict, Iterator, List, Optional, Set, Tuple @@ -46,7 +47,6 @@ from zarr.v3.common import ( JSON, ChunkCoords, - BytesLike, SliceSelection, ) from zarr.v3.config import RuntimeConfiguration @@ -128,15 +128,15 @@ def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardIndex: class _ShardProxy(Mapping): index: _ShardIndex - buf: BytesLike + buf: Buffer @classmethod async def from_bytes( - cls, buf: BytesLike, codec: ShardingCodec, chunks_per_shard: ChunkCoords + cls, buf: Buffer, codec: ShardingCodec, chunks_per_shard: ChunkCoords ) -> _ShardProxy: shard_index_size = codec._shard_index_size(chunks_per_shard) obj = cls() - obj.buf = memoryview(buf) + obj.buf = buf if codec.index_location == ShardingCodecIndexLocation.start: shard_index_bytes = obj.buf[:shard_index_size] else: @@ -149,11 +149,11 @@ async def from_bytes( def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() - obj.buf = memoryview(b"") + obj.buf = as_buffer(memoryview(b"")) obj.index = index return obj - def __getitem__(self, chunk_coords: ChunkCoords) -> Optional[BytesLike]: + def __getitem__(self, chunk_coords: ChunkCoords) -> Optional[Buffer]: chunk_byte_slice = self.index.get_chunk_slice(chunk_coords) if chunk_byte_slice: return self.buf[chunk_byte_slice[0] : chunk_byte_slice[1]] @@ -167,7 +167,7 @@ def __iter__(self) -> Iterator[ChunkCoords]: class _ShardBuilder(_ShardProxy): - buf: bytearray + buf: Buffer index: _ShardIndex @classmethod @@ -175,7 +175,7 @@ def merge_with_morton_order( cls, chunks_per_shard: ChunkCoords, tombstones: Set[ChunkCoords], - *shard_dicts: Mapping[ChunkCoords, BytesLike], + *shard_dicts: Mapping[ChunkCoords, Buffer], ) -> _ShardBuilder: obj = cls.create_empty(chunks_per_shard) for chunk_coords in morton_order_iter(chunks_per_shard): @@ -191,31 +191,29 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = bytearray() + obj.buf = Buffer(bytearray(0)) obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj - def append(self, chunk_coords: ChunkCoords, value: BytesLike): + def append(self, chunk_coords: ChunkCoords, value: Buffer): chunk_start = len(self.buf) chunk_length = len(value) - self.buf.extend(value) + self.buf = self.buf + value self.index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) async def finalize( self, index_location: ShardingCodecIndexLocation, - index_encoder: Callable[[_ShardIndex], Awaitable[BytesLike]], - ) -> BytesLike: + index_encoder: Callable[[_ShardIndex], Awaitable[Buffer]], + ) -> Buffer: index_bytes = await index_encoder(self.index) if index_location == ShardingCodecIndexLocation.start: self.index.offsets_and_lengths[..., 0] += len(index_bytes) index_bytes = await index_encoder(self.index) # encode again with corrected offsets - out_buf = bytearray(index_bytes) - out_buf.extend(self.buf) + out_buf = index_bytes + self.buf else: - out_buf = self.buf - out_buf.extend(index_bytes) - return out_buf + out_buf = self.buf + index_bytes + return as_buffer(out_buf) @dataclass(frozen=True) @@ -300,7 +298,7 @@ def validate(self, array_metadata: ArrayMetadata) -> None: async def decode( self, - shard_bytes: BytesLike, + shard_bytes: Buffer, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: @@ -375,7 +373,7 @@ async def decode_partial( all_chunk_coords = 
set(chunk_coords for chunk_coords, _, _ in indexed_chunks) # reading bytes of all requested chunks - shard_dict: Mapping[ChunkCoords, BytesLike] = {} + shard_dict: Mapping[ChunkCoords, Buffer] = {} if self._is_total_shard(all_chunk_coords, chunks_per_shard): # read entire shard shard_dict_maybe = await self._load_full_shard_maybe(store_path, chunks_per_shard) @@ -417,7 +415,7 @@ async def decode_partial( async def _read_chunk( self, - shard_dict: Mapping[ChunkCoords, Optional[BytesLike]], + shard_dict: Mapping[ChunkCoords, Optional[Buffer]], chunk_coords: ChunkCoords, chunk_selection: SliceSelection, out_selection: SliceSelection, @@ -439,7 +437,7 @@ async def encode( shard_array: np.ndarray, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) @@ -457,7 +455,7 @@ async def _write_chunk( chunk_coords: ChunkCoords, chunk_selection: SliceSelection, out_selection: SliceSelection, - ) -> Tuple[ChunkCoords, Optional[BytesLike]]: + ) -> Tuple[ChunkCoords, Optional[Buffer]]: if is_total_slice(chunk_selection, chunk_shape): chunk_array = shard_array[out_selection] else: @@ -477,7 +475,7 @@ async def _write_chunk( return (chunk_coords, None) # assembling and encoding chunks within the shard - encoded_chunks: List[Tuple[ChunkCoords, Optional[BytesLike]]] = await concurrent_map( + encoded_chunks: List[Tuple[ChunkCoords, Optional[Buffer]]] = await concurrent_map( [ (shard_array, chunk_coords, chunk_selection, out_selection) for chunk_coords, chunk_selection, out_selection in indexer @@ -527,7 +525,7 @@ async def _write_chunk( chunk_coords: ChunkCoords, chunk_selection: SliceSelection, out_selection: SliceSelection, - ) -> Tuple[ChunkCoords, Optional[BytesLike]]: + ) -> Tuple[ChunkCoords, Optional[Buffer]]: chunk_array = None if is_total_slice(chunk_selection, self.chunk_shape): chunk_array = shard_array[out_selection] @@ -557,7 +555,7 @@ async def _write_chunk( else: return (chunk_coords, None) - encoded_chunks: List[Tuple[ChunkCoords, Optional[BytesLike]]] = await concurrent_map( + encoded_chunks: List[Tuple[ChunkCoords, Optional[Buffer]]] = await concurrent_map( [ ( chunk_coords, @@ -601,7 +599,7 @@ def _is_total_shard( ) async def _decode_shard_index( - self, index_bytes: BytesLike, chunks_per_shard: ChunkCoords + self, index_bytes: Buffer, chunks_per_shard: ChunkCoords ) -> _ShardIndex: return _ShardIndex( await self.index_codecs.decode( @@ -611,13 +609,14 @@ async def _decode_shard_index( ) ) - async def _encode_shard_index(self, index: _ShardIndex) -> BytesLike: + async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = await self.index_codecs.encode( index.offsets_and_lengths, self._get_index_chunk_spec(index.chunks_per_shard), make_runtime_configuration("C"), ) assert index_bytes is not None + assert isinstance(index_bytes, Buffer) return index_bytes def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int: diff --git a/src/zarr/v3/codecs/zstd.py b/src/zarr/v3/codecs/zstd.py index 774bb8bdbb..ebd60d5746 100644 --- a/src/zarr/v3/codecs/zstd.py +++ b/src/zarr/v3/codecs/zstd.py @@ -8,12 +8,13 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_named_configuration, to_thread +from zarr.v3.buffer import Buffer, as_bytes_wrapper if TYPE_CHECKING: from typing import Dict, Optional from typing_extensions 
import Self from zarr.v3.config import RuntimeConfiguration - from zarr.v3.common import BytesLike, JSON, ArraySpec + from zarr.v3.common import JSON, ArraySpec def parse_zstd_level(data: JSON) -> int: @@ -62,19 +63,19 @@ def _decompress(self, data: bytes) -> bytes: async def decode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> BytesLike: - return await to_thread(self._decompress, chunk_bytes) + ) -> Buffer: + return await to_thread(as_bytes_wrapper, self._decompress, chunk_bytes) async def encode( self, - chunk_bytes: bytes, + chunk_bytes: Buffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - return await to_thread(self._compress, chunk_bytes) + ) -> Optional[Buffer]: + return await to_thread(as_bytes_wrapper, self._compress, chunk_bytes) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError diff --git a/src/zarr/v3/group.py b/src/zarr/v3/group.py index fcd2fea215..d0f5754e6a 100644 --- a/src/zarr/v3/group.py +++ b/src/zarr/v3/group.py @@ -11,6 +11,7 @@ from zarr.v3.attributes import Attributes from zarr.v3.common import ZARR_JSON, ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON from zarr.v3.config import RuntimeConfiguration, SyncConfiguration +from zarr.v3.buffer import as_buffer from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import SyncMixin, sync @@ -113,7 +114,9 @@ async def open( # (it is optional in the case of implicit groups) zarr_json_bytes = await (store_path / ZARR_JSON).get() zarr_json = ( - json.loads(zarr_json_bytes) if zarr_json_bytes is not None else {"zarr_format": 3} + json.loads(zarr_json_bytes.as_bytearray()) + if zarr_json_bytes is not None + else {"zarr_format": 3} ) elif zarr_format == 2: @@ -123,11 +126,15 @@ async def open( (store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get() ) zgroup = ( - json.loads(json.loads(zgroup_bytes)) + json.loads(json.loads(zgroup_bytes.as_bytearray())) if zgroup_bytes is not None else {"zarr_format": 2} ) - zattrs = json.loads(json.loads(zattrs_bytes)) if zattrs_bytes is not None else {} + zattrs = ( + json.loads(json.loads(zattrs_bytes.as_bytearray())) + if zattrs_bytes is not None + else {} + ) zarr_json = {**zgroup, "attributes": zattrs} else: raise ValueError(f"unexpected zarr_format: {zarr_format}") @@ -164,7 +171,7 @@ async def getitem( "attributes": {}, } else: - zarr_json = json.loads(zarr_json_bytes) + zarr_json = json.loads(zarr_json_bytes.as_bytearray()) if zarr_json["node_type"] == "group": return type(self).from_dict(store_path, zarr_json, self.runtime_configuration) elif zarr_json["node_type"] == "array": @@ -183,9 +190,9 @@ async def getitem( ) # unpack the zarray, if this is None then we must be opening a group - zarray = json.loads(zarray_bytes) if zarray_bytes else None + zarray = json.loads(zarray_bytes.as_bytearray()) if zarray_bytes else None # unpack the zattrs, this can be None if no attrs were written - zattrs = json.loads(zattrs_bytes) if zattrs_bytes is not None else {} + zattrs = json.loads(zattrs_bytes.as_bytearray()) if zattrs_bytes is not None else {} if zarray is not None: # TODO: update this once the V2 array support is part of the primary array class @@ -198,7 +205,7 @@ async def getitem( # implicit group? 
logger.warning("group at {} is an implicit group", store_path) zgroup = ( - json.loads(zgroup_bytes) + json.loads(zgroup_bytes.as_bytearray()) if zgroup_bytes is not None else {"zarr_format": self.metadata.zarr_format} ) @@ -221,7 +228,9 @@ async def delitem(self, key: str) -> None: async def _save_metadata(self) -> None: to_save = self.metadata.to_bytes() - awaitables = [(self.store_path / key).set(value) for key, value in to_save.items()] + awaitables = [ + (self.store_path / key).set(as_buffer(value)) for key, value in to_save.items() + ] await asyncio.gather(*awaitables) @property @@ -257,9 +266,9 @@ async def update_attributes(self, new_attributes: Dict[str, Any]): to_save = self.metadata.to_bytes() if self.metadata.zarr_format == 2: # only save the .zattrs object - await (self.store_path / ZATTRS_JSON).set(to_save[ZATTRS_JSON]) + await (self.store_path / ZATTRS_JSON).set(as_buffer(to_save[ZATTRS_JSON])) else: - await (self.store_path / ZARR_JSON).set(to_save[ZARR_JSON]) + await (self.store_path / ZARR_JSON).set(as_buffer(to_save[ZARR_JSON])) self.metadata.attributes.clear() self.metadata.attributes.update(new_attributes) @@ -383,7 +392,9 @@ async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> Group # Write new metadata to_save = new_metadata.to_bytes() - awaitables = [(self.store_path / key).set(value) for key, value in to_save.items()] + awaitables = [ + (self.store_path / key).set(as_buffer(value)) for key, value in to_save.items() + ] await asyncio.gather(*awaitables) async_group = replace(self._async_group, metadata=new_metadata) diff --git a/src/zarr/v3/metadata.py b/src/zarr/v3/metadata.py index 573b8484f0..8543bbcb77 100644 --- a/src/zarr/v3/metadata.py +++ b/src/zarr/v3/metadata.py @@ -8,6 +8,7 @@ from zarr.v3.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.v3.chunk_key_encodings import ChunkKeyEncoding, parse_separator +from zarr.v3.buffer import Buffer, as_buffer if TYPE_CHECKING: @@ -291,7 +292,7 @@ def __init__( def ndim(self) -> int: return len(self.shape) - def to_bytes(self) -> bytes: + def to_bytes(self) -> Buffer: def _json_convert(o): if isinstance(o, np.dtype): if o.fields is None: @@ -300,7 +301,7 @@ def _json_convert(o): return o.descr raise TypeError - return json.dumps(self.to_dict(), default=_json_convert).encode() + return as_buffer(json.dumps(self.to_dict(), default=_json_convert).encode()) @classmethod def from_dict(cls, data: Dict[str, Any]) -> ArrayV2Metadata: diff --git a/src/zarr/v3/store/core.py b/src/zarr/v3/store/core.py index 16714d9e30..f214afe0b6 100644 --- a/src/zarr/v3/store/core.py +++ b/src/zarr/v3/store/core.py @@ -3,8 +3,8 @@ from pathlib import Path from typing import Any, Optional, Tuple, Union -from zarr.v3.common import BytesLike from zarr.v3.abc.store import Store +from zarr.v3.buffer import Buffer from zarr.v3.store.local import LocalStore @@ -25,12 +25,10 @@ def __init__(self, store: Store, path: Optional[str] = None): self.store = store self.path = path or "" - async def get( - self, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: + async def get(self, byte_range: Optional[Tuple[int, Optional[int]]] = None) -> Optional[Buffer]: return await self.store.get(self.path, byte_range) - async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: + async def set(self, value: Buffer, byte_range: Optional[Tuple[int, int]] = None) -> None: if byte_range is not None: raise NotImplementedError("Store.set does not have partial writes yet") await 
self.store.set(self.path, value) diff --git a/src/zarr/v3/store/local.py b/src/zarr/v3/store/local.py index 8f02b904c0..502c52f155 100644 --- a/src/zarr/v3/store/local.py +++ b/src/zarr/v3/store/local.py @@ -6,15 +6,16 @@ from typing import Union, Optional, List, Tuple from zarr.v3.abc.store import Store -from zarr.v3.common import BytesLike, concurrent_map, to_thread +from zarr.v3.common import concurrent_map, to_thread +from zarr.v3.buffer import Buffer, as_buffer -def _get(path: Path, byte_range: Optional[Tuple[int, Optional[int]]] = None) -> bytes: +def _get(path: Path, byte_range: Optional[Tuple[int, Optional[int]]] = None) -> Buffer: if byte_range is not None: start = byte_range[0] end = (start + byte_range[1]) if byte_range[1] is not None else None else: - return path.read_bytes() + return as_buffer(path.read_bytes()) with path.open("rb") as f: size = f.seek(0, io.SEEK_END) if start is not None: @@ -25,13 +26,13 @@ def _get(path: Path, byte_range: Optional[Tuple[int, Optional[int]]] = None) -> if end is not None: if end < 0: end = size + end - return f.read(end - f.tell()) - return f.read() + return as_buffer(f.read(end - f.tell())) + return as_buffer(f.read()) def _put( path: Path, - value: BytesLike, + value: Buffer, start: Optional[int] = None, auto_mkdir: bool = True, ): @@ -40,9 +41,9 @@ def _put( if start is not None: with path.open("r+b") as f: f.seek(start) - f.write(value) + f.write(value.as_bytearray()) else: - return path.write_bytes(value) + return path.write_bytes(value.as_bytearray()) class LocalStore(Store): @@ -72,7 +73,7 @@ def __eq__(self, other: object) -> bool: async def get( self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[bytes]: + ) -> Optional[Buffer]: assert isinstance(key, str) path = self.root / key @@ -83,7 +84,7 @@ async def get( async def get_partial_values( self, key_ranges: List[Tuple[str, Tuple[int, int]]] - ) -> List[bytes]: + ) -> List[Buffer]: args = [] for key, byte_range in key_ranges: assert isinstance(key, str) @@ -94,7 +95,7 @@ async def get_partial_values( args.append((_get, path)) return await concurrent_map(args, to_thread, limit=None) # TODO: fix limit - async def set(self, key: str, value: BytesLike) -> None: + async def set(self, key: str, value: Buffer) -> None: assert isinstance(key, str) path = self.root / key await to_thread(_put, path, value) diff --git a/src/zarr/v3/store/memory.py b/src/zarr/v3/store/memory.py index afacfa4321..2b389dbef6 100644 --- a/src/zarr/v3/store/memory.py +++ b/src/zarr/v3/store/memory.py @@ -2,8 +2,8 @@ from typing import Optional, MutableMapping, List, Tuple -from zarr.v3.common import BytesLike from zarr.v3.abc.store import Store +from zarr.v3.buffer import Buffer # TODO: this store could easily be extended to wrap any MutuableMapping store from v2 @@ -13,9 +13,9 @@ class MemoryStore(Store): supports_partial_writes: bool = True supports_listing: bool = True - _store_dict: MutableMapping[str, bytes] + _store_dict: MutableMapping[str, Buffer] - def __init__(self, store_dict: Optional[MutableMapping[str, bytes]] = None): + def __init__(self, store_dict: Optional[MutableMapping[str, Buffer]] = None): self._store_dict = store_dict or {} def __str__(self) -> str: @@ -26,7 +26,7 @@ def __repr__(self) -> str: async def get( self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: assert isinstance(key, str) try: value = self._store_dict[key] @@ -38,21 +38,21 @@ async def get( async def get_partial_values( 
self, key_ranges: List[Tuple[str, Tuple[int, int]]] - ) -> List[bytes]: + ) -> List[Buffer]: raise NotImplementedError async def exists(self, key: str) -> bool: return key in self._store_dict async def set( - self, key: str, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + self, key: str, value: Buffer, byte_range: Optional[Tuple[int, int]] = None ) -> None: assert isinstance(key, str) - if not isinstance(value, (bytes, bytearray, memoryview)): - raise TypeError(f"Expected BytesLike. Got {type(value)}.") + if not isinstance(value, Buffer): + raise TypeError(f"Expected Buffer. Got {type(value)}.") if byte_range is not None: - buf = bytearray(self._store_dict[key]) + buf = self._store_dict[key] buf[byte_range[0] : byte_range[1]] = value self._store_dict[key] = buf else: diff --git a/src/zarr/v3/store/remote.py b/src/zarr/v3/store/remote.py index 0e6fc84e08..e903eab888 100644 --- a/src/zarr/v3/store/remote.py +++ b/src/zarr/v3/store/remote.py @@ -3,8 +3,8 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union from zarr.v3.abc.store import Store +from zarr.v3.buffer import Buffer from zarr.v3.store.core import _dereference_path -from zarr.v3.common import BytesLike if TYPE_CHECKING: @@ -52,7 +52,7 @@ def _make_fs(self) -> Tuple[AsyncFileSystem, str]: async def get( self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: + ) -> Optional[Buffer]: assert isinstance(key, str) fs, root = self._make_fs() path = _dereference_path(root, key) @@ -69,7 +69,7 @@ async def get( return value async def set( - self, key: str, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None + self, key: str, value: Buffer, byte_range: Optional[Tuple[int, int]] = None ) -> None: assert isinstance(key, str) fs, root = self._make_fs() diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index 333c2094bf..d32cebbd6d 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -24,6 +24,7 @@ from zarr.v3.metadata import runtime_configuration from zarr.v3.abc.store import Store +from zarr.v3.buffer import as_bytearray from zarr.v3.store import MemoryStore, StorePath @@ -295,7 +296,7 @@ async def test_order( fill_value=1, ) z[:, :] = data - assert await (store / "order/0.0").get() == z._store["0.0"] + assert as_bytearray(await (store / "order/0.0").get()) == z._store["0.0"] @pytest.mark.parametrize("input_order", ["F", "C"]) @@ -671,10 +672,10 @@ async def test_zarr_compat(store: Store): assert np.array_equal(data, await _AsyncArrayProxy(a)[:16, :18].get()) assert np.array_equal(data, z2[:16, :18]) - assert z2._store["0.0"] == await (store / "zarr_compat3/0.0").get() - assert z2._store["0.1"] == await (store / "zarr_compat3/0.1").get() - assert z2._store["1.0"] == await (store / "zarr_compat3/1.0").get() - assert z2._store["1.1"] == await (store / "zarr_compat3/1.1").get() + assert z2._store["0.0"] == as_bytearray(await (store / "zarr_compat3/0.0").get()) + assert z2._store["0.1"] == as_bytearray(await (store / "zarr_compat3/0.1").get()) + assert z2._store["1.0"] == as_bytearray(await (store / "zarr_compat3/1.0").get()) + assert z2._store["1.1"] == as_bytearray(await (store / "zarr_compat3/1.1").get()) @pytest.mark.asyncio @@ -705,10 +706,10 @@ async def test_zarr_compat_F(store: Store): assert np.array_equal(data, await _AsyncArrayProxy(a)[:16, :18].get()) assert np.array_equal(data, z2[:16, :18]) - assert z2._store["0.0"] == await (store / "zarr_compatF3/0.0").get() - assert z2._store["0.1"] == await (store / 
"zarr_compatF3/0.1").get() - assert z2._store["1.0"] == await (store / "zarr_compatF3/1.0").get() - assert z2._store["1.1"] == await (store / "zarr_compatF3/1.1").get() + assert z2._store["0.0"] == as_bytearray(await (store / "zarr_compatF3/0.0").get()) + assert z2._store["0.1"] == as_bytearray(await (store / "zarr_compatF3/0.1").get()) + assert z2._store["1.0"] == as_bytearray(await (store / "zarr_compatF3/1.0").get()) + assert z2._store["1.1"] == as_bytearray(await (store / "zarr_compatF3/1.1").get()) @pytest.mark.asyncio @@ -738,7 +739,7 @@ async def test_dimension_names(store: Store): ) assert (await AsyncArray.open(store / "dimension_names2")).metadata.dimension_names is None - zarr_json_bytes = await (store / "dimension_names2" / "zarr.json").get() + zarr_json_bytes = as_bytearray(await (store / "dimension_names2" / "zarr.json").get()) assert zarr_json_bytes is not None assert "dimension_names" not in json.loads(zarr_json_bytes) @@ -804,7 +805,7 @@ async def test_endian(store: Store, endian: Literal["big", "little"]): fill_value=1, ) z[:, :] = data - assert await (store / "endian/0.0").get() == z._store["0.0"] + assert as_bytearray(await (store / "endian/0.0").get()) == z._store["0.0"] @pytest.mark.parametrize("dtype_input_endian", [">u2", " Date: Tue, 30 Apr 2024 13:30:17 +0200 Subject: [PATCH 02/45] use memoryview as the underlying memory --- src/zarr/v3/array.py | 4 ++-- src/zarr/v3/array_v2.py | 6 ++---- src/zarr/v3/buffer.py | 38 +++++++++++++++++++--------------- src/zarr/v3/codecs/crc32c_.py | 8 +++---- src/zarr/v3/codecs/sharding.py | 2 +- src/zarr/v3/group.py | 16 +++++++------- src/zarr/v3/store/local.py | 4 ++-- 7 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/zarr/v3/array.py b/src/zarr/v3/array.py index 54f0d03a95..003fa77194 100644 --- a/src/zarr/v3/array.py +++ b/src/zarr/v3/array.py @@ -151,7 +151,7 @@ async def open( assert zarr_json_bytes is not None return cls.from_dict( store_path, - json.loads(zarr_json_bytes.as_bytearray()), + json.loads(zarr_json_bytes.to_bytes()), runtime_configuration=runtime_configuration, ) @@ -166,7 +166,7 @@ async def open_auto( if v3_metadata_bytes is not None: return cls.from_dict( store_path, - json.loads(v3_metadata_bytes.as_bytearray()), + json.loads(v3_metadata_bytes.to_bytes()), runtime_configuration=runtime_configuration or RuntimeConfiguration(), ) else: diff --git a/src/zarr/v3/array_v2.py b/src/zarr/v3/array_v2.py index 4f6cbece8c..2229c739ce 100644 --- a/src/zarr/v3/array_v2.py +++ b/src/zarr/v3/array_v2.py @@ -153,10 +153,8 @@ async def open_async( assert zarray_bytes is not None return cls.from_dict( store_path, - zarray_json=json.loads(zarray_bytes.as_bytearray()), - zattrs_json=json.loads(zattrs_bytes.as_bytearray()) - if zattrs_bytes is not None - else None, + zarray_json=json.loads(zarray_bytes.to_bytes()), + zattrs_json=json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else None, runtime_configuration=runtime_configuration, ) diff --git a/src/zarr/v3/buffer.py b/src/zarr/v3/buffer.py index 669e9059c7..3850eff6d4 100644 --- a/src/zarr/v3/buffer.py +++ b/src/zarr/v3/buffer.py @@ -21,50 +21,54 @@ class Buffer(NDBuffer): of memory such as CUDA device memory. 
""" - def __init__(self, data: bytearray): - assert isinstance(data, bytearray) + def __init__(self, data: memoryview): + assert isinstance(data, memoryview) + assert data.ndim == 1 + assert data.contiguous + assert data.itemsize == 1 self._data = data - def as_bytearray(self) -> bytearray: + def memoryview(self) -> memoryview: return self._data + def to_bytes(self) -> bytes: + return bytes(self.memoryview()) + def as_numpy_array(self, dtype: np.DTypeLike) -> np.ndarray: - return np.frombuffer(self._data, dtype=dtype) + return np.frombuffer(self.memoryview(), dtype=dtype) def __getitem__(self, key) -> Self: - return self.__class__(self.as_bytearray().__getitem__(key)) + return self.__class__(self.memoryview().__getitem__(key)) def __setitem__(self, key, value) -> None: - self.as_bytearray().__setitem__(key, value) + self.memoryview().__setitem__(key, value) def __len__(self) -> int: - return len(self.as_bytearray()) + return len(self.memoryview()) def __add__(self, other: Buffer) -> Self: - return self.__class__(self.as_bytearray() + other.as_bytearray()) + return self.__class__(memoryview(self.to_bytes() + other.to_bytes())) def as_buffer(data: Any) -> Buffer: if isinstance(data, Buffer): return data - if isinstance(data, bytearray): - return Buffer(data) - if isinstance(data, bytes): - return Buffer(bytearray(data)) + if isinstance(data, bytearray | bytes): + return Buffer(memoryview(data)) if hasattr(data, "to_bytes"): - return as_buffer(data.to_bytes()) - return Buffer(bytearray(np.asarray(data))) + return as_buffer(memoryview(data.to_bytes())) + return Buffer(memoryview(np.asanyarray(data).reshape(-1).view(dtype="int8"))) def as_bytes_wrapper(func, buf: Buffer) -> Buffer: - return as_buffer(func(buf.as_bytearray())) + return as_buffer(func(buf.to_bytes())) def return_as_bytes_wrapper(func, *arg, **kwargs) -> Buffer: return as_buffer(func(*arg, **kwargs)) -def as_bytearray(data: Optional[Buffer]): +def as_bytearray(data: Optional[Buffer]) -> Optional[bytes]: if data is None: return data - return data.as_bytearray() + return data.to_bytes() diff --git a/src/zarr/v3/codecs/crc32c_.py b/src/zarr/v3/codecs/crc32c_.py index 5677018f5e..ba32455199 100644 --- a/src/zarr/v3/codecs/crc32c_.py +++ b/src/zarr/v3/codecs/crc32c_.py @@ -10,7 +10,7 @@ from zarr.v3.abc.codec import BytesBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_named_configuration -from zarr.v3.buffer import Buffer +from zarr.v3.buffer import Buffer, as_buffer if TYPE_CHECKING: from typing import Dict, Optional @@ -37,7 +37,7 @@ async def decode( _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> Buffer: - data = chunk_bytes.as_bytearray() + data = chunk_bytes.memoryview() crc32_bytes = data[-4:] inner_bytes = data[:-4] @@ -56,8 +56,8 @@ async def encode( _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> Optional[Buffer]: - bytes = chunk_bytes.as_bytearray() - return Buffer(bytes + np.uint32(crc32c(bytes)).tobytes()) + checksum = crc32c(chunk_bytes.memoryview()) + return as_buffer(chunk_bytes.to_bytes() + np.uint32(checksum).tobytes()) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 diff --git a/src/zarr/v3/codecs/sharding.py b/src/zarr/v3/codecs/sharding.py index 25e440dbc8..849103a38e 100644 --- a/src/zarr/v3/codecs/sharding.py +++ b/src/zarr/v3/codecs/sharding.py @@ -191,7 +191,7 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: 
ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = Buffer(bytearray(0)) + obj.buf = Buffer(memoryview(b"")) obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj diff --git a/src/zarr/v3/group.py b/src/zarr/v3/group.py index d0f5754e6a..7cf51f673e 100644 --- a/src/zarr/v3/group.py +++ b/src/zarr/v3/group.py @@ -114,7 +114,7 @@ async def open( # (it is optional in the case of implicit groups) zarr_json_bytes = await (store_path / ZARR_JSON).get() zarr_json = ( - json.loads(zarr_json_bytes.as_bytearray()) + json.loads(zarr_json_bytes.to_bytes()) if zarr_json_bytes is not None else {"zarr_format": 3} ) @@ -126,14 +126,12 @@ async def open( (store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get() ) zgroup = ( - json.loads(json.loads(zgroup_bytes.as_bytearray())) + json.loads(json.loads(zgroup_bytes.to_bytes())) if zgroup_bytes is not None else {"zarr_format": 2} ) zattrs = ( - json.loads(json.loads(zattrs_bytes.as_bytearray())) - if zattrs_bytes is not None - else {} + json.loads(json.loads(zattrs_bytes.to_bytes())) if zattrs_bytes is not None else {} ) zarr_json = {**zgroup, "attributes": zattrs} else: @@ -171,7 +169,7 @@ async def getitem( "attributes": {}, } else: - zarr_json = json.loads(zarr_json_bytes.as_bytearray()) + zarr_json = json.loads(zarr_json_bytes.to_bytes()) if zarr_json["node_type"] == "group": return type(self).from_dict(store_path, zarr_json, self.runtime_configuration) elif zarr_json["node_type"] == "array": @@ -190,9 +188,9 @@ async def getitem( ) # unpack the zarray, if this is None then we must be opening a group - zarray = json.loads(zarray_bytes.as_bytearray()) if zarray_bytes else None + zarray = json.loads(zarray_bytes.to_bytes()) if zarray_bytes else None # unpack the zattrs, this can be None if no attrs were written - zattrs = json.loads(zattrs_bytes.as_bytearray()) if zattrs_bytes is not None else {} + zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} if zarray is not None: # TODO: update this once the V2 array support is part of the primary array class @@ -205,7 +203,7 @@ async def getitem( # implicit group? logger.warning("group at {} is an implicit group", store_path) zgroup = ( - json.loads(zgroup_bytes.as_bytearray()) + json.loads(zgroup_bytes.to_bytes()) if zgroup_bytes is not None else {"zarr_format": self.metadata.zarr_format} ) diff --git a/src/zarr/v3/store/local.py b/src/zarr/v3/store/local.py index 502c52f155..b9ab68e53c 100644 --- a/src/zarr/v3/store/local.py +++ b/src/zarr/v3/store/local.py @@ -41,9 +41,9 @@ def _put( if start is not None: with path.open("r+b") as f: f.seek(start) - f.write(value.as_bytearray()) + f.write(value.memoryview()) else: - return path.write_bytes(value.as_bytearray()) + return path.write_bytes(value.memoryview()) class LocalStore(Store): From eb6d097946916e4956ee8bcece7baab2ca1d2bf7 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 1 May 2024 16:39:46 +0200 Subject: [PATCH 03/45] use NDBuffer --- src/zarr/v3/abc/codec.py | 19 +++-- src/zarr/v3/array.py | 37 ++++++---- src/zarr/v3/array_v2.py | 14 ++-- src/zarr/v3/buffer.py | 120 ++++++++++++++++++++++++++------ src/zarr/v3/codecs/bytes.py | 20 ++---- src/zarr/v3/codecs/crc32c_.py | 2 +- src/zarr/v3/codecs/pipeline.py | 11 ++- src/zarr/v3/codecs/sharding.py | 57 +++++++-------- src/zarr/v3/codecs/transpose.py | 10 +-- 9 files changed, 182 insertions(+), 108 deletions(-) diff --git a/src/zarr/v3/abc/codec.py b/src/zarr/v3/abc/codec.py index 796f321465..4dd2207e09 100644 --- a/src/zarr/v3/abc/codec.py +++ b/src/zarr/v3/abc/codec.py @@ -3,11 +3,10 @@ from abc import abstractmethod from typing import TYPE_CHECKING, Optional -import numpy as np from zarr.v3.abc.metadata import Metadata from zarr.v3.common import ArraySpec -from zarr.v3.buffer import Buffer +from zarr.v3.buffer import Buffer, NDBuffer from zarr.v3.store import StorePath @@ -39,19 +38,19 @@ class ArrayArrayCodec(Codec): @abstractmethod async def decode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: + ) -> NDBuffer: pass @abstractmethod async def encode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: + ) -> Optional[NDBuffer]: pass @@ -62,13 +61,13 @@ async def decode( chunk_array: Buffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: + ) -> NDBuffer: pass @abstractmethod async def encode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> Optional[Buffer]: @@ -83,7 +82,7 @@ async def decode_partial( selection: SliceSelection, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: + ) -> Optional[NDBuffer]: pass @@ -92,7 +91,7 @@ class ArrayBytesCodecPartialEncodeMixin: async def encode_partial( self, store_path: StorePath, - chunk_array: np.ndarray, + chunk_array: NDBuffer, selection: SliceSelection, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, diff --git a/src/zarr/v3/array.py b/src/zarr/v3/array.py index 003fa77194..550b978f91 100644 --- a/src/zarr/v3/array.py +++ b/src/zarr/v3/array.py @@ -35,7 +35,7 @@ from zarr.v3.chunk_grids import RegularChunkGrid from zarr.v3.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.v3.metadata import ArrayMetadata -from zarr.v3.buffer import as_buffer +from zarr.v3.buffer import NDBuffer, as_buffer, as_nd_buffer from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync @@ -202,8 +202,8 @@ async def getitem(self, selection: Selection): ) # setup output array - out = np.zeros( - indexer.shape, + out = NDBuffer.create_zeros( + shape=indexer.shape, dtype=self.metadata.dtype, order=self.runtime_configuration.order, ) @@ -218,10 +218,11 @@ async def getitem(self, selection: Selection): self.runtime_configuration.concurrency, ) + # We always return a numpy array to the user if out.shape: - return out + return out.as_numpy_array() else: - return out[()] + return out.as_numpy_array()[()] async def _save_metadata(self) -> None: await (self.store_path / ZARR_JSON).set(as_buffer(self.metadata.to_bytes())) @@ -231,7 +232,7 @@ async def _read_chunk( chunk_coords: ChunkCoords, chunk_selection: SliceSelection, 
out_selection: SliceSelection, - out: np.ndarray, + out: NDBuffer, ): chunk_spec = self.metadata.get_chunk_spec(chunk_coords) chunk_key_encoding = self.metadata.chunk_key_encoding @@ -258,6 +259,7 @@ async def _read_chunk( out[out_selection] = self.metadata.fill_value async def setitem(self, selection: Selection, value: np.ndarray) -> None: + assert isinstance(value, np.ndarray) assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) chunk_shape = self.metadata.chunk_grid.chunk_shape indexer = BasicIndexer( @@ -279,6 +281,10 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: if value.dtype.name != self.metadata.dtype.name: value = value.astype(self.metadata.dtype, order="A") + # We accept a numpy array as input from the user and convert it to a NDBuffer. + # From this point onwards, we only pass Buffer and NDBuffer between components. + value = as_nd_buffer(value) + # merging with existing data and encoding chunks await concurrent_map( [ @@ -297,12 +303,13 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: async def _write_chunk( self, - value: np.ndarray, + value: NDBuffer, chunk_shape: ChunkCoords, chunk_coords: ChunkCoords, chunk_selection: SliceSelection, out_selection: SliceSelection, ): + assert isinstance(value, NDBuffer) chunk_spec = self.metadata.get_chunk_spec(chunk_coords) chunk_key_encoding = self.metadata.chunk_key_encoding chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords) @@ -311,8 +318,8 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = np.empty( - chunk_shape, + chunk_array = NDBuffer.create_empty( + shape=chunk_shape, dtype=self.metadata.dtype, ) chunk_array.fill(value) @@ -336,8 +343,8 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = np.empty( - chunk_shape, + chunk_array = NDBuffer.create_empty( + shape=chunk_shape, dtype=self.metadata.dtype, ) chunk_array.fill(self.metadata.fill_value) @@ -350,9 +357,9 @@ async def _write_chunk( await self._write_chunk_to_store(store_path, chunk_array, chunk_spec) async def _write_chunk_to_store( - self, store_path: StorePath, chunk_array: np.ndarray, chunk_spec: ArraySpec + self, store_path: StorePath, chunk_array: NDBuffer, chunk_spec: ArraySpec ): - if np.all(chunk_array == self.metadata.fill_value): + if np.all(chunk_array.as_numpy_array() == self.metadata.fill_value): # chunks that only contain fill_value will be removed await store_path.delete() else: @@ -393,14 +400,14 @@ async def _delete_key(key: str) -> None: ) # Write new metadata - await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata)) + await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata.to_bytes())) return replace(self, metadata=new_metadata) async def update_attributes(self, new_attributes: Dict[str, Any]) -> AsyncArray: new_metadata = replace(self.metadata, attributes=new_attributes) # Write new metadata - await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata)) + await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata.to_bytes())) return replace(self, metadata=new_metadata) def __repr__(self): diff --git a/src/zarr/v3/array_v2.py b/src/zarr/v3/array_v2.py index 2229c739ce..79a5b2ecad 100644 --- a/src/zarr/v3/array_v2.py +++ b/src/zarr/v3/array_v2.py @@ -23,7 +23,7 @@ from zarr.v3.config import RuntimeConfiguration from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice from zarr.v3.metadata import ArrayV2Metadata -from 
zarr.v3.buffer import as_buffer, as_bytearray +from zarr.v3.buffer import NDBuffer, as_buffer, as_bytearray from zarr.v3.store import StoreLike, StorePath, make_store_path from zarr.v3.sync import sync @@ -230,8 +230,8 @@ async def get_async(self, selection: Selection): ) # setup output array - out = np.zeros( - indexer.shape, + out = NDBuffer.create_zeros( + shape=indexer.shape, dtype=self.metadata.dtype, order=self.metadata.order, ) @@ -347,8 +347,8 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = np.empty( - chunk_shape, + chunk_array = NDBuffer.create_empty( + shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, ) @@ -364,8 +364,8 @@ async def _write_chunk( # merge new value if tmp is None: - chunk_array = np.empty( - chunk_shape, + chunk_array = NDBuffer.create_empty( + shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, ) diff --git a/src/zarr/v3/buffer.py b/src/zarr/v3/buffer.py index 3850eff6d4..e9affa3877 100644 --- a/src/zarr/v3/buffer.py +++ b/src/zarr/v3/buffer.py @@ -1,16 +1,83 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional +import sys +from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional, Tuple import numpy as np if TYPE_CHECKING: from typing_extensions import Self + from zarr.v3.codecs.bytes import Endian class NDBuffer: - # TODO: replace np.ndarray with this n-dimensional buffer - pass + def __init__(self, array: np.ndarray): + assert isinstance(array, np.ndarray) + assert array.dtype != object + self._data = array + + @classmethod + def create_empty( + cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C" + ): + return cls(np.empty(shape=shape, dtype=dtype, order=order)) + + @classmethod + def create_zeros( + cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C" + ): + return cls(np.zeros(shape=shape, dtype=dtype, order=order)) + + def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: + if dtype is None: + return self._data + return self._data.astype(dtype=dtype, copy=False) + + @property + def dtype(self) -> np.DTypeLike: + return self.as_numpy_array().dtype + + @property + def shape(self) -> Tuple[int, ...]: + return self.as_numpy_array().shape + + @property + def byteorder(self) -> Endian: + from zarr.v3.codecs.bytes import Endian + + if self.dtype.byteorder == "<": + return Endian.little + elif self.dtype.byteorder == ">": + return Endian.big + else: + return Endian(sys.byteorder) + + def reshape(self, newshape: Iterable[int]) -> Self: + return self.__class__(self.as_numpy_array().reshape(newshape)) + + def astype(self, dtype: np.DTypeLike, order: Literal["K", "A", "C", "F"] = "K") -> Self: + return self.__class__(self.as_numpy_array().astype(dtype=dtype, order=order)) + + def __getitem__(self, key) -> Self: + # print("__getitem__: \n", np.asanyarray(self.as_numpy_array().__getitem__(key))) + return self.__class__(np.asanyarray(self.as_numpy_array().__getitem__(key))) + + def __setitem__(self, key, value) -> None: + if isinstance(value, NDBuffer): + value = value.as_numpy_array() + self.as_numpy_array().__setitem__(key, value) + + def __len__(self) -> int: + return self.as_numpy_array().__len__() + + def fill(self, value: Any) -> None: + self.as_numpy_array().fill(value) + + def copy(self) -> Self: + return self.__class__(self.as_numpy_array().copy()) + + def transpose(self, *axes) -> Self: + return 
self.__class__(self.as_numpy_array().transpose(*axes)) class Buffer(NDBuffer): @@ -21,43 +88,52 @@ class Buffer(NDBuffer): of memory such as CUDA device memory. """ - def __init__(self, data: memoryview): - assert isinstance(data, memoryview) - assert data.ndim == 1 - assert data.contiguous - assert data.itemsize == 1 - self._data = data + @classmethod + def create_empty( + cls, *, shape: Iterable[int], dtype: np.DTypeLike = "b", order: Literal["C", "F"] = "C" + ): + return cls(np.empty(shape=shape, dtype=dtype, order=order)) def memoryview(self) -> memoryview: - return self._data + return memoryview(self._data.reshape(-1).view(dtype="b")) + + def as_numpy_array(self, dtype: Optional[np.DTypeLike] = "b") -> np.ndarray: + return self._data.reshape(-1).view(dtype=dtype) def to_bytes(self) -> bytes: return bytes(self.memoryview()) - def as_numpy_array(self, dtype: np.DTypeLike) -> np.ndarray: - return np.frombuffer(self.memoryview(), dtype=dtype) - def __getitem__(self, key) -> Self: - return self.__class__(self.memoryview().__getitem__(key)) + return self.__class__(self.as_numpy_array().__getitem__(key)) def __setitem__(self, key, value) -> None: - self.memoryview().__setitem__(key, value) + self.as_numpy_array().__setitem__(key, value) def __len__(self) -> int: - return len(self.memoryview()) + return self._data.nbytes def __add__(self, other: Buffer) -> Self: - return self.__class__(memoryview(self.to_bytes() + other.to_bytes())) + return self.__class__(np.frombuffer(self.to_bytes() + other.to_bytes(), dtype="b")) + + +def as_nd_buffer(data: Any) -> NDBuffer: + if isinstance(data, NDBuffer): + return data + return NDBuffer(np.asanyarray(data)) + + +def as_ndarray(data: Optional[NDBuffer]) -> Optional[np.ndarray]: + if data is None: + return data + return data.as_numpy_array() def as_buffer(data: Any) -> Buffer: if isinstance(data, Buffer): return data - if isinstance(data, bytearray | bytes): - return Buffer(memoryview(data)) - if hasattr(data, "to_bytes"): - return as_buffer(memoryview(data.to_bytes())) - return Buffer(memoryview(np.asanyarray(data).reshape(-1).view(dtype="int8"))) + if isinstance(data, NDBuffer): + return Buffer(data.as_numpy_array()) + return Buffer(np.asanyarray(data)) def as_bytes_wrapper(func, buf: Buffer) -> Buffer: diff --git a/src/zarr/v3/codecs/bytes.py b/src/zarr/v3/codecs/bytes.py index 6ae9e7ddda..f0e5c04d30 100644 --- a/src/zarr/v3/codecs/bytes.py +++ b/src/zarr/v3/codecs/bytes.py @@ -10,7 +10,7 @@ from zarr.v3.abc.codec import ArrayBytesCodec from zarr.v3.codecs.registry import register_codec from zarr.v3.common import parse_enum, parse_named_configuration -from zarr.v3.buffer import Buffer, as_buffer +from zarr.v3.buffer import Buffer, NDBuffer, as_buffer if TYPE_CHECKING: from zarr.v3.common import JSON, ArraySpec @@ -61,20 +61,12 @@ def evolve(self, array_spec: ArraySpec) -> Self: ) return self - def _get_byteorder(self, array: np.ndarray) -> Endian: - if array.dtype.byteorder == "<": - return Endian.little - elif array.dtype.byteorder == ">": - return Endian.big - else: - return default_system_endian - async def decode( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: + ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) if chunk_spec.dtype.itemsize > 0: if self.endian == Endian.little: @@ -84,7 +76,7 @@ async def decode( dtype = np.dtype(f"{prefix}{chunk_spec.dtype.str[1:]}") else: dtype = np.dtype(f"|{chunk_spec.dtype.str[1:]}") - chunk_array = chunk_bytes.as_numpy_array(dtype) + 
chunk_array = NDBuffer(chunk_bytes.as_numpy_array(dtype)) # ensure correct chunk shape if chunk_array.shape != chunk_spec.shape: @@ -95,13 +87,13 @@ async def decode( async def encode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> Optional[Buffer]: + assert isinstance(chunk_array, NDBuffer) if chunk_array.dtype.itemsize > 1: - byteorder = self._get_byteorder(chunk_array) - if self.endian is not None and self.endian != byteorder: + if self.endian is not None and self.endian != chunk_array.byteorder: new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) chunk_array = chunk_array.astype(new_dtype) return as_buffer(chunk_array) diff --git a/src/zarr/v3/codecs/crc32c_.py b/src/zarr/v3/codecs/crc32c_.py index ba32455199..4d174eaed0 100644 --- a/src/zarr/v3/codecs/crc32c_.py +++ b/src/zarr/v3/codecs/crc32c_.py @@ -48,7 +48,7 @@ async def decode( "Stored and computed checksum do not match. " + f"Stored: {stored_checksum!r}. Computed: {computed_checksum!r}." ) - return Buffer(inner_bytes) + return as_buffer(inner_bytes) async def encode( self, diff --git a/src/zarr/v3/codecs/pipeline.py b/src/zarr/v3/codecs/pipeline.py index 91ba1926cc..bd68f58e36 100644 --- a/src/zarr/v3/codecs/pipeline.py +++ b/src/zarr/v3/codecs/pipeline.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING, Iterable -import numpy as np from dataclasses import dataclass from warnings import warn @@ -16,7 +15,7 @@ from zarr.v3.abc.metadata import Metadata from zarr.v3.codecs.registry import get_codec_class from zarr.v3.common import parse_named_configuration -from zarr.v3.buffer import Buffer +from zarr.v3.buffer import Buffer, NDBuffer if TYPE_CHECKING: from typing import Iterator, List, Optional, Tuple, Union @@ -153,7 +152,7 @@ async def decode( chunk_bytes: Buffer, array_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: + ) -> NDBuffer: ( aa_codecs_with_spec, ab_codec_with_spec, @@ -177,7 +176,7 @@ async def decode_partial( selection: SliceSelection, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: + ) -> Optional[NDBuffer]: assert self.supports_partial_decode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) return await self.array_bytes_codec.decode_partial( @@ -186,7 +185,7 @@ async def decode_partial( async def encode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, array_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> Optional[Buffer]: @@ -224,7 +223,7 @@ async def encode( async def encode_partial( self, store_path: StorePath, - chunk_array: np.ndarray, + chunk_array: NDBuffer, selection: SliceSelection, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, diff --git a/src/zarr/v3/codecs/sharding.py b/src/zarr/v3/codecs/sharding.py index 849103a38e..fbb037ef91 100644 --- a/src/zarr/v3/codecs/sharding.py +++ b/src/zarr/v3/codecs/sharding.py @@ -37,7 +37,7 @@ runtime_configuration as make_runtime_configuration, parse_codecs, ) -from zarr.v3.buffer import Buffer, as_buffer +from zarr.v3.buffer import Buffer, NDBuffer, as_buffer, as_nd_buffer if TYPE_CHECKING: from typing import Awaitable, Callable, Dict, Iterator, List, Optional, Set, Tuple @@ -149,7 +149,7 @@ async def from_bytes( def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() - obj.buf = as_buffer(memoryview(b"")) + 
obj.buf = as_buffer(np.array([], dtype="b")) obj.index = index return obj @@ -191,7 +191,7 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = Buffer(memoryview(b"")) + obj.buf = as_buffer(np.array([], dtype="b")) obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj @@ -213,7 +213,7 @@ async def finalize( out_buf = index_bytes + self.buf else: out_buf = self.buf + index_bytes - return as_buffer(out_buf) + return out_buf @dataclass(frozen=True) @@ -301,7 +301,7 @@ async def decode( shard_bytes: Buffer, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: + ) -> NDBuffer: # print("decode") shard_shape = shard_spec.shape chunk_shape = self.chunk_shape @@ -314,8 +314,8 @@ async def decode( ) # setup output array - out = np.zeros( - shard_shape, + out = NDBuffer.create_zeros( + shape=shard_shape, dtype=shard_spec.dtype, order=runtime_configuration.order, ) @@ -351,7 +351,7 @@ async def decode_partial( selection: SliceSelection, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: + ) -> Optional[NDBuffer]: shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) @@ -363,8 +363,8 @@ async def decode_partial( ) # setup output array - out = np.zeros( - indexer.shape, + out = NDBuffer.create_zeros( + shape=indexer.shape, dtype=shard_spec.dtype, order=runtime_configuration.order, ) @@ -410,7 +410,6 @@ async def decode_partial( self._read_chunk, runtime_configuration.concurrency, ) - return out async def _read_chunk( @@ -421,7 +420,7 @@ async def _read_chunk( out_selection: SliceSelection, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, - out: np.ndarray, + out: NDBuffer, ): chunk_spec = self._get_chunk_spec(shard_spec) chunk_bytes = shard_dict.get(chunk_coords, None) @@ -434,7 +433,7 @@ async def _read_chunk( async def encode( self, - shard_array: np.ndarray, + shard_array: NDBuffer, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> Optional[Buffer]: @@ -451,22 +450,23 @@ async def encode( ) async def _write_chunk( - shard_array: np.ndarray, + shard_array: NDBuffer, chunk_coords: ChunkCoords, chunk_selection: SliceSelection, out_selection: SliceSelection, ) -> Tuple[ChunkCoords, Optional[Buffer]]: + assert isinstance(shard_array, NDBuffer) if is_total_slice(chunk_selection, chunk_shape): chunk_array = shard_array[out_selection] else: # handling writing partial chunks - chunk_array = np.empty( - chunk_shape, + chunk_array = NDBuffer.create_empty( + shape=chunk_shape, dtype=shard_spec.dtype, ) chunk_array.fill(shard_spec.fill_value) chunk_array[chunk_selection] = shard_array[out_selection] - if not np.array_equiv(chunk_array, shard_spec.fill_value): + if not np.array_equiv(chunk_array.as_numpy_array(), shard_spec.fill_value): chunk_spec = self._get_chunk_spec(shard_spec) return ( chunk_coords, @@ -496,7 +496,7 @@ async def _write_chunk( async def encode_partial( self, store_path: StorePath, - shard_array: np.ndarray, + shard_array: NDBuffer, selection: SliceSelection, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, @@ -526,7 +526,6 @@ async def _write_chunk( chunk_selection: SliceSelection, out_selection: SliceSelection, ) -> Tuple[ChunkCoords, Optional[Buffer]]: - chunk_array = None if is_total_slice(chunk_selection, self.chunk_shape): chunk_array = shard_array[out_selection] else: @@ -536,8 +535,8 @@ 
async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = np.empty( - self.chunk_shape, + chunk_array = NDBuffer.create_empty( + shape=self.chunk_shape, dtype=shard_spec.dtype, ) chunk_array.fill(shard_spec.fill_value) @@ -547,7 +546,7 @@ async def _write_chunk( ).copy() # make a writable copy chunk_array[chunk_selection] = shard_array[out_selection] - if not np.array_equiv(chunk_array, shard_spec.fill_value): + if not np.array_equiv(chunk_array.as_numpy_array(), shard_spec.fill_value): return ( chunk_coords, await self.codecs.encode(chunk_array, chunk_spec, runtime_configuration), @@ -602,16 +601,18 @@ async def _decode_shard_index( self, index_bytes: Buffer, chunks_per_shard: ChunkCoords ) -> _ShardIndex: return _ShardIndex( - await self.index_codecs.decode( - index_bytes, - self._get_index_chunk_spec(chunks_per_shard), - make_runtime_configuration("C"), - ) + ( + await self.index_codecs.decode( + index_bytes, + self._get_index_chunk_spec(chunks_per_shard), + make_runtime_configuration("C"), + ) + ).as_numpy_array() ) async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = await self.index_codecs.encode( - index.offsets_and_lengths, + as_nd_buffer(index.offsets_and_lengths), self._get_index_chunk_spec(index.chunks_per_shard), make_runtime_configuration("C"), ) diff --git a/src/zarr/v3/codecs/transpose.py b/src/zarr/v3/codecs/transpose.py index b663230e35..b09072705e 100644 --- a/src/zarr/v3/codecs/transpose.py +++ b/src/zarr/v3/codecs/transpose.py @@ -3,6 +3,7 @@ from dataclasses import dataclass, replace +from zarr.v3.buffer import NDBuffer from zarr.v3.common import JSON, ArraySpec, ChunkCoordsLike, parse_named_configuration if TYPE_CHECKING: @@ -10,7 +11,6 @@ from typing import TYPE_CHECKING, Optional, Tuple from typing_extensions import Self -import numpy as np from zarr.v3.abc.codec import ArrayArrayCodec from zarr.v3.codecs.registry import register_codec @@ -75,10 +75,10 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: async def decode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: + ) -> NDBuffer: inverse_order = [0] * chunk_spec.ndim for x, i in enumerate(self.order): inverse_order[x] = i @@ -87,10 +87,10 @@ async def decode( async def encode( self, - chunk_array: np.ndarray, + chunk_array: NDBuffer, chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: + ) -> Optional[NDBuffer]: chunk_array = chunk_array.transpose(self.order) return chunk_array From 2982c9baf5303fd21b57346bf0756f20e03ab6e0 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
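After this patch the division of labour is: codecs exchange `NDBuffer` (typed, n-dimensional), stores exchange `Buffer` (flat bytes), and numpy arrays only appear at the user-facing boundary. A rough usage sketch of the API as it stands here (illustrative only, not part of the patch series):

    import numpy as np
    from zarr.v3.buffer import NDBuffer, as_buffer, as_nd_buffer

    # chunk data travels between codecs as an NDBuffer ...
    nd = NDBuffer.create_zeros(shape=(2, 2), dtype=np.float64)
    nd.fill(1.0)

    # ... while encoded output is a flat Buffer that stores can persist
    buf = as_buffer(nd)
    assert len(buf.to_bytes()) == 2 * 2 * 8

    # user-supplied numpy arrays get wrapped at the boundary
    assert as_nd_buffer(np.ones((2, 2))).as_numpy_array().shape == (2, 2)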
Kristensen" Date: Thu, 2 May 2024 09:32:12 +0200 Subject: [PATCH 04/45] convert to Buffer for the v2 tests --- src/zarr/buffer.py | 7 +++++++ src/zarr/store/local.py | 5 +++++ src/zarr/store/memory.py | 5 ++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 644668c104..68de0a63e7 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -115,6 +115,13 @@ def __len__(self) -> int: def __add__(self, other: Buffer) -> Self: return self.__class__(np.frombuffer(self.to_bytes() + other.to_bytes(), dtype="b")) + def __eq__(self, other: Any) -> bool: + if isinstance(other, (bytes, bytearray)): + return self.to_bytes() == other + raise ValueError( + f"equal operator not supported between {self.__class__} and {other.__class__}" + ) + def as_nd_buffer(data: Any) -> NDBuffer: if isinstance(data, NDBuffer): diff --git a/src/zarr/store/local.py b/src/zarr/store/local.py index 059e122636..6e1f353f43 100644 --- a/src/zarr/store/local.py +++ b/src/zarr/store/local.py @@ -98,6 +98,11 @@ async def get_partial_values( async def set(self, key: str, value: Buffer) -> None: assert isinstance(key, str) + if isinstance(value, (bytes, bytearray)): + # TODO: to support the v2 tests, we convert bytes to Buffer here + value = as_buffer(value) + if not isinstance(value, Buffer): + raise TypeError("LocalStore.set(): `value` must a Buffer instance") path = self.root / key await to_thread(_put, path, value) diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py index d496c3cb5f..93d2ace150 100644 --- a/src/zarr/store/memory.py +++ b/src/zarr/store/memory.py @@ -3,7 +3,7 @@ from typing import Optional, MutableMapping, List, Tuple from zarr.abc.store import Store -from zarr.buffer import Buffer +from zarr.buffer import Buffer, as_buffer # TODO: this store could easily be extended to wrap any MutuableMapping store from v2 @@ -48,6 +48,9 @@ async def set( self, key: str, value: Buffer, byte_range: Optional[Tuple[int, int]] = None ) -> None: assert isinstance(key, str) + if isinstance(value, (bytes, bytearray)): + # TODO: to support the v2 tests, we convert bytes to Buffer here + value = as_buffer(value) if not isinstance(value, Buffer): raise TypeError(f"Expected Buffer. Got {type(value)}.") From 45ad25471f4fcb1f50aa924cd2da44414337aaea Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Thu, 2 May 2024 09:37:44 +0200 Subject: [PATCH 05/45] clean up --- tests/v3/test_codecs.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index 9607ced37d..4f84f99436 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -296,7 +296,7 @@ async def test_order( fill_value=1, ) z[:, :] = data - assert as_bytearray(await (store / "order/0.0").get()) == z._store["0.0"] + assert (await (store / "order/0.0").get()) == z._store["0.0"] @pytest.mark.parametrize("input_order", ["F", "C"]) @@ -672,10 +672,10 @@ async def test_zarr_compat(store: Store): assert np.array_equal(data, await _AsyncArrayProxy(a)[:16, :18].get()) assert np.array_equal(data, z2[:16, :18]) - assert z2._store["0.0"] == as_bytearray(await (store / "zarr_compat3/0.0").get()) - assert z2._store["0.1"] == as_bytearray(await (store / "zarr_compat3/0.1").get()) - assert z2._store["1.0"] == as_bytearray(await (store / "zarr_compat3/1.0").get()) - assert z2._store["1.1"] == as_bytearray(await (store / "zarr_compat3/1.1").get()) + assert z2._store["0.0"] == await (store / "zarr_compat3/0.0").get() + assert z2._store["0.1"] == await (store / "zarr_compat3/0.1").get() + assert z2._store["1.0"] == await (store / "zarr_compat3/1.0").get() + assert z2._store["1.1"] == await (store / "zarr_compat3/1.1").get() @pytest.mark.asyncio @@ -706,10 +706,10 @@ async def test_zarr_compat_F(store: Store): assert np.array_equal(data, await _AsyncArrayProxy(a)[:16, :18].get()) assert np.array_equal(data, z2[:16, :18]) - assert z2._store["0.0"] == as_bytearray(await (store / "zarr_compatF3/0.0").get()) - assert z2._store["0.1"] == as_bytearray(await (store / "zarr_compatF3/0.1").get()) - assert z2._store["1.0"] == as_bytearray(await (store / "zarr_compatF3/1.0").get()) - assert z2._store["1.1"] == as_bytearray(await (store / "zarr_compatF3/1.1").get()) + assert z2._store["0.0"] == await (store / "zarr_compatF3/0.0").get() + assert z2._store["0.1"] == await (store / "zarr_compatF3/0.1").get() + assert z2._store["1.0"] == await (store / "zarr_compatF3/1.0").get() + assert z2._store["1.1"] == await (store / "zarr_compatF3/1.1").get() @pytest.mark.asyncio @@ -805,7 +805,7 @@ async def test_endian(store: Store, endian: Literal["big", "little"]): fill_value=1, ) z[:, :] = data - assert as_bytearray(await (store / "endian/0.0").get()) == z._store["0.0"] + assert await (store / "endian/0.0").get() == z._store["0.0"] @pytest.mark.parametrize("dtype_input_endian", [">u2", " Date: Mon, 6 May 2024 11:57:59 +0200 Subject: [PATCH 06/45] spilling --- src/zarr/store/memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py index 93d2ace150..c2e7cf3807 100644 --- a/src/zarr/store/memory.py +++ b/src/zarr/store/memory.py @@ -6,7 +6,7 @@ from zarr.buffer import Buffer, as_buffer -# TODO: this store could easily be extended to wrap any MutuableMapping store from v2 +# TODO: this store could easily be extended to wrap any MutableMapping store from v2 # When that is done, the `MemoryStore` will just be a store that wraps a dict. class MemoryStore(Store): supports_writes: bool = True From 71dcff1df83b18c186d15e35ad587e9dc6aac772 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 6 May 2024 12:03:28 +0200 Subject: [PATCH 07/45] remove return_as_bytes_wrapper --- src/zarr/buffer.py | 4 ---- src/zarr/codecs/blosc.py | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 68de0a63e7..1ef264d246 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -147,10 +147,6 @@ def as_bytes_wrapper(func: Callable[[bytes], bytes], buf: Buffer) -> Buffer: return as_buffer(func(buf.to_bytes())) -def return_as_bytes_wrapper(func: Callable[[Any], Any], *arg: Any, **kwargs: Any) -> Buffer: - return as_buffer(func(*arg, **kwargs)) - - def as_bytearray(data: Optional[Buffer]) -> Optional[bytes]: if data is None: return data diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 182eda3e8a..1e9d6ab153 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -9,7 +9,7 @@ from numcodecs.blosc import Blosc from zarr.abc.codec import BytesBytesCodec -from zarr.buffer import Buffer, as_bytes_wrapper, return_as_bytes_wrapper +from zarr.buffer import Buffer, as_buffer, as_bytes_wrapper from zarr.codecs.registry import register_codec from zarr.common import parse_enum, parse_named_configuration, to_thread @@ -174,7 +174,7 @@ async def encode( _runtime_configuration: RuntimeConfiguration, ) -> Optional[Buffer]: chunk_array = chunk_bytes.as_numpy_array(chunk_spec.dtype) - return await to_thread(return_as_bytes_wrapper, self._blosc_codec.encode, chunk_array) + return await to_thread(lambda: as_buffer(self._blosc_codec.encode(chunk_array))) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError From 48edc4e4750623430e633042afe0079cdb6ab7c1 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 6 May 2024 12:06:30 +0200 Subject: [PATCH 08/45] remove as_ndarray --- src/zarr/buffer.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 1ef264d246..5b4275cf84 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -129,12 +129,6 @@ def as_nd_buffer(data: Any) -> NDBuffer: return NDBuffer(np.asanyarray(data)) -def as_ndarray(data: Optional[NDBuffer]) -> Optional[np.ndarray]: - if data is None: - return data - return data.as_numpy_array() - - def as_buffer(data: Any) -> Buffer: if isinstance(data, Buffer): return data From 5a83442ac7ee22661e1334e1161cfcb5c1a21381 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 6 May 2024 12:12:29 +0200 Subject: [PATCH 09/45] doc --- src/zarr/buffer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 5b4275cf84..14378dfd89 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -11,6 +11,13 @@ class NDBuffer: + """A n-dimensional memory block + + We use `NDBuffer` throughout Zarr to represent a block of memory. + For now, we only support host memory but the plan is to support other types + of memory such as CUDA device memory. + """ + def __init__(self, array: np.ndarray): assert isinstance(array, np.ndarray) assert array.dtype != object @@ -81,12 +88,7 @@ def transpose(self, *axes: np.SupportsIndex) -> Self: class Buffer(NDBuffer): - """Contiguous memory block - - We use `Buffer` throughout Zarr to represent a contiguous block of memory. - For now, we only support host memory but the plan is to support other types - of memory such as CUDA device memory. 
- """ + """A flat contiguous version of `NDBuffer` with an item size of 1""" @classmethod def create_empty( From e6d49f39f317c7bd54334ee57f7cc07527a424fc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 6 May 2024 12:14:01 +0200 Subject: [PATCH 10/45] clean up --- src/zarr/buffer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 14378dfd89..04774963bb 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -66,7 +66,6 @@ def astype(self, dtype: np.DTypeLike, order: Literal["K", "A", "C", "F"] = "K") return self.__class__(self.as_numpy_array().astype(dtype=dtype, order=order)) def __getitem__(self, key: Any) -> Self: - # print("__getitem__: \n", np.asanyarray(self.as_numpy_array().__getitem__(key))) return self.__class__(np.asanyarray(self.as_numpy_array().__getitem__(key))) def __setitem__(self, key: Any, value: Any) -> None: From 009ad29c403420d31fc017e510cecc7529896bc8 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 8 May 2024 13:39:15 +0200 Subject: [PATCH 11/45] as_buffer(): handle bytes like --- src/zarr/buffer.py | 4 +++- src/zarr/group.py | 6 +++--- src/zarr/testing/store.py | 21 +++++++++++---------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 04774963bb..c753c85b5c 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -41,7 +41,7 @@ def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: return self._data.astype(dtype=dtype, copy=False) @property - def dtype(self) -> np.DTypeLike: + def dtype(self) -> np.dtype[Any]: return self.as_numpy_array().dtype @property @@ -135,6 +135,8 @@ def as_buffer(data: Any) -> Buffer: return data if isinstance(data, NDBuffer): return Buffer(data.as_numpy_array()) + if isinstance(data, (bytes, bytearray, memoryview)): + return Buffer(np.frombuffer(data, dtype="b")) return Buffer(np.asanyarray(data)) diff --git a/src/zarr/group.py b/src/zarr/group.py index b7cf82a21d..852ab2c7ff 100644 --- a/src/zarr/group.py +++ b/src/zarr/group.py @@ -148,13 +148,13 @@ async def open( if zarr_format == 2: # V2 groups are comprised of a .zgroup and .zattrs objects assert zgroup_bytes is not None - zgroup = json.loads(zgroup_bytes) - zattrs = json.loads(zattrs_bytes) if zattrs_bytes is not None else {} + zgroup = json.loads(zgroup_bytes.to_bytes()) + zattrs = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {} group_metadata = {**zgroup, "attributes": zattrs} else: # V3 groups are comprised of a zarr.json object assert zarr_json_bytes is not None - group_metadata = json.loads(zarr_json_bytes) + group_metadata = json.loads(zarr_json_bytes.to_bytes()) return cls.from_dict(store_path, group_metadata, runtime_configuration) diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 601ef7f393..48f2ce46b0 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -1,6 +1,7 @@ import pytest from zarr.abc.store import Store +from zarr.buffer import as_buffer class StoreTests: @@ -25,14 +26,14 @@ def test_store_capabilities(self, store: Store) -> None: @pytest.mark.parametrize("key", ["c/0", "foo/c/0.0", "foo/0/0"]) @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) async def test_set_get_bytes_roundtrip(self, store: Store, key: str, data: bytes) -> None: - await store.set(key, data) + await store.set(key, as_buffer(data)) assert await store.get(key) == data @pytest.mark.parametrize("key", ["foo/c/0"]) @pytest.mark.parametrize("data", 
[b"\x01\x02\x03\x04", b""]) async def test_get_partial_values(self, store: Store, key: str, data: bytes) -> None: # put all of the data - await store.set(key, data) + await store.set(key, as_buffer(data)) # read back just part of it vals = await store.get_partial_values([(key, (0, 2))]) assert vals == [data[0:2]] @@ -43,18 +44,18 @@ async def test_get_partial_values(self, store: Store, key: str, data: bytes) -> async def test_exists(self, store: Store) -> None: assert not await store.exists("foo") - await store.set("foo/zarr.json", b"bar") + await store.set("foo/zarr.json", as_buffer(b"bar")) assert await store.exists("foo/zarr.json") async def test_delete(self, store: Store) -> None: - await store.set("foo/zarr.json", b"bar") + await store.set("foo/zarr.json", as_buffer(b"bar")) assert await store.exists("foo/zarr.json") await store.delete("foo/zarr.json") assert not await store.exists("foo/zarr.json") async def test_list(self, store: Store) -> None: assert [k async for k in store.list()] == [] - await store.set("foo/zarr.json", b"bar") + await store.set("foo/zarr.json", as_buffer(b"bar")) keys = [k async for k in store.list()] assert keys == ["foo/zarr.json"], keys @@ -62,7 +63,7 @@ async def test_list(self, store: Store) -> None: for i in range(10): key = f"foo/c/{i}" expected.append(key) - await store.set(f"foo/c/{i}", i.to_bytes(length=3, byteorder="little")) + await store.set(f"foo/c/{i}", as_buffer(i.to_bytes(length=3, byteorder="little"))) async def test_list_prefix(self, store: Store) -> None: # TODO: we currently don't use list_prefix anywhere @@ -71,11 +72,11 @@ async def test_list_prefix(self, store: Store) -> None: async def test_list_dir(self, store: Store) -> None: assert [k async for k in store.list_dir("")] == [] assert [k async for k in store.list_dir("foo")] == [] - await store.set("foo/zarr.json", b"bar") - await store.set("foo/c/1", b"\x01") + await store.set("foo/zarr.json", as_buffer(b"bar")) + await store.set("foo/c/1", as_buffer(b"\x01")) keys = [k async for k in store.list_dir("foo")] - assert keys == ["zarr.json", "c"], keys + assert set(keys) == set(["zarr.json", "c"]), keys keys = [k async for k in store.list_dir("foo/")] - assert keys == ["zarr.json", "c"], keys + assert set(keys) == set(["zarr.json", "c"]), keys From c189a4f14bb49a27b9fc981c4b88367351b5c52c Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 13 May 2024 10:58:05 +0200 Subject: [PATCH 12/45] removed sync.py again --- src/zarr/v3/sync.py | 131 -------------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 src/zarr/v3/sync.py diff --git a/src/zarr/v3/sync.py b/src/zarr/v3/sync.py deleted file mode 100644 index a152030e89..0000000000 --- a/src/zarr/v3/sync.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar - -if TYPE_CHECKING: - from typing import Any, AsyncIterator, Coroutine - -import asyncio -from concurrent.futures import wait -import threading - -from typing_extensions import ParamSpec - -from zarr.config import SyncConfiguration - -P = ParamSpec("P") -T = TypeVar("T") - -# From https://github.com/fsspec/filesystem_spec/blob/master/fsspec/asyn.py - -iothread: list[threading.Thread | None] = [None] # dedicated IO thread -loop: list[asyncio.AbstractEventLoop | None] = [ - None -] # global event loop for any non-async instance -_lock: threading.Lock | None = None # global lock placeholder -get_running_loop = asyncio.get_running_loop - - -class SyncError(Exception): - pass - - -def _get_lock() -> threading.Lock: - """Allocate or return a threading lock. - - The lock is allocated on first use to allow setting one lock per forked process. - """ - global _lock - if not _lock: - _lock = threading.Lock() - return _lock - - -async def _runner(coro: Coroutine[Any, Any, T]) -> T | BaseException: - """ - Await a coroutine and return the result of running it. If awaiting the coroutine raises an - exception, the exception will be returned. - """ - try: - return await coro - except Exception as ex: - return ex - - -def sync( - coro: Coroutine[Any, Any, T], - loop: asyncio.AbstractEventLoop | None = None, - timeout: float | None = None, -) -> T: - """ - Make loop run coroutine until it returns. Runs in other thread - - Examples - -------- - >>> sync(async_function(), existing_loop) - """ - if loop is None: - # NB: if the loop is not running *yet*, it is OK to submit work - # and we will wait for it - loop = _get_loop() - if not isinstance(loop, asyncio.AbstractEventLoop): - raise TypeError(f"loop cannot be of type {type(loop)}") - if loop.is_closed(): - raise RuntimeError("Loop is not running") - try: - loop0 = asyncio.events.get_running_loop() - if loop0 is loop: - raise SyncError("Calling sync() from within a running loop") - except RuntimeError: - pass - - future = asyncio.run_coroutine_threadsafe(_runner(coro), loop) - - finished, unfinished = wait([future], return_when=asyncio.ALL_COMPLETED, timeout=timeout) - if len(unfinished) > 0: - raise asyncio.TimeoutError(f"Coroutine {coro} failed to finish in within {timeout}s") - assert len(finished) == 1 - return_result = list(finished)[0].result() - - if isinstance(return_result, BaseException): - raise return_result - else: - return return_result - - -def _get_loop() -> asyncio.AbstractEventLoop: - """Create or return the default fsspec IO loop - - The loop will be running on a separate thread. 
- """ - if loop[0] is None: - with _get_lock(): - # repeat the check just in case the loop got filled between the - # previous two calls from another thread - if loop[0] is None: - new_loop = asyncio.new_event_loop() - loop[0] = new_loop - th = threading.Thread(target=new_loop.run_forever, name="zarrIO") - th.daemon = True - th.start() - iothread[0] = th - assert loop[0] is not None - return loop[0] - - -class SyncMixin: - _sync_configuration: SyncConfiguration - - def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T: - # TODO: refactor this to to take *args and **kwargs and pass those to the method - # this should allow us to better type the sync wrapper - return sync( - coroutine, - loop=self._sync_configuration.asyncio_loop, - timeout=self._sync_configuration.timeout, - ) - - def _sync_iter(self, async_iterator: AsyncIterator[T]) -> list[T]: - async def iter_to_list() -> list[T]: - return [item async for item in async_iterator] - - return self._sync(iter_to_list()) From 7cb9346780c40786f07778ddbb12d7c83e41eecc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 12:27:36 +0200 Subject: [PATCH 13/45] separate Buffer and NNBuffer --- src/zarr/buffer.py | 75 +++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index c753c85b5c..1a2419ab69 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -10,6 +10,43 @@ from zarr.codecs.bytes import Endian +class Buffer: + """A flat contiguous version of `NDBuffer` with an item size of 1""" + + def __init__(self, array: np.ndarray): + assert isinstance(array, np.ndarray) + assert array.dtype != object + self._data = array + + def memoryview(self) -> memoryview: + return memoryview(self._data.reshape(-1).view(dtype="b")) + + def as_numpy_array(self, dtype: Optional[np.DTypeLike] = "b") -> np.ndarray: + return self._data.reshape(-1).view(dtype=dtype) + + def to_bytes(self) -> bytes: + return bytes(self.memoryview()) + + def __getitem__(self, key: Any) -> Self: + return self.__class__(self.as_numpy_array().__getitem__(key)) + + def __setitem__(self, key: Any, value: Any) -> None: + self.as_numpy_array().__setitem__(key, value) + + def __len__(self) -> int: + return self._data.nbytes + + def __add__(self, other: Buffer) -> Self: + return self.__class__(np.frombuffer(self.to_bytes() + other.to_bytes(), dtype="b")) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, (bytes, bytearray)): + return self.to_bytes() == other + raise ValueError( + f"equal operator not supported between {self.__class__} and {other.__class__}" + ) + + class NDBuffer: """A n-dimensional memory block @@ -86,44 +123,6 @@ def transpose(self, *axes: np.SupportsIndex) -> Self: return self.__class__(self.as_numpy_array().transpose(*axes)) -class Buffer(NDBuffer): - """A flat contiguous version of `NDBuffer` with an item size of 1""" - - @classmethod - def create_empty( - cls, *, shape: Iterable[int], dtype: np.DTypeLike = "b", order: Literal["C", "F"] = "C" - ) -> Self: - return cls(np.empty(shape=shape, dtype=dtype, order=order)) - - def memoryview(self) -> memoryview: - return memoryview(self._data.reshape(-1).view(dtype="b")) - - def as_numpy_array(self, dtype: Optional[np.DTypeLike] = "b") -> np.ndarray: - return self._data.reshape(-1).view(dtype=dtype) - - def to_bytes(self) -> bytes: - return bytes(self.memoryview()) - - def __getitem__(self, key: Any) -> Self: - return self.__class__(self.as_numpy_array().__getitem__(key)) - - def 
__setitem__(self, key: Any, value: Any) -> None: - self.as_numpy_array().__setitem__(key, value) - - def __len__(self) -> int: - return self._data.nbytes - - def __add__(self, other: Buffer) -> Self: - return self.__class__(np.frombuffer(self.to_bytes() + other.to_bytes(), dtype="b")) - - def __eq__(self, other: Any) -> bool: - if isinstance(other, (bytes, bytearray)): - return self.to_bytes() == other - raise ValueError( - f"equal operator not supported between {self.__class__} and {other.__class__}" - ) - - def as_nd_buffer(data: Any) -> NDBuffer: if isinstance(data, NDBuffer): return data From 2ba8510fbbfcaf26e977a893fd9d61befd8a27e2 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 13:19:12 +0200 Subject: [PATCH 14/45] impl. NDBuffer.from_numpy_array() --- src/zarr/array.py | 4 ++-- src/zarr/buffer.py | 10 ++++------ src/zarr/codecs/sharding.py | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 6d606a14ee..8b6f3312d6 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -20,7 +20,7 @@ # from zarr.array_v2 import ArrayV2 -from zarr.buffer import NDBuffer, as_buffer, as_nd_buffer +from zarr.buffer import NDBuffer, as_buffer from zarr.codecs import BytesCodec from zarr.common import ( ZARR_JSON, @@ -269,7 +269,7 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None: # We accept a numpy array as input from the user and convert it to a NDBuffer. # From this point onwards, we only pass Buffer and NDBuffer between components. - value = as_nd_buffer(value) + value = NDBuffer.from_numpy_array(value) # merging with existing data and encoding chunks await concurrent_map( diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 1a2419ab69..727ad79b96 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -60,6 +60,10 @@ def __init__(self, array: np.ndarray): assert array.dtype != object self._data = array + @classmethod + def from_numpy_array(cls, array: np.ArrayLike) -> Self: + return cls(np.asanyarray(array)) + @classmethod def create_empty( cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C" @@ -123,12 +127,6 @@ def transpose(self, *axes: np.SupportsIndex) -> Self: return self.__class__(self.as_numpy_array().transpose(*axes)) -def as_nd_buffer(data: Any) -> NDBuffer: - if isinstance(data, NDBuffer): - return data - return NDBuffer(np.asanyarray(data)) - - def as_buffer(data: Any) -> Buffer: if isinstance(data, Buffer): return data diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 966033d63a..b16a35e66f 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -37,7 +37,7 @@ ArrayMetadata, parse_codecs, ) -from zarr.buffer import Buffer, NDBuffer, as_buffer, as_nd_buffer +from zarr.buffer import Buffer, NDBuffer, as_buffer if TYPE_CHECKING: from typing import Awaitable, Callable, Dict, Iterator, List, Optional, Set, Tuple @@ -603,7 +603,7 @@ async def _decode_shard_index( async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = await self.index_codecs.encode( - as_nd_buffer(index.offsets_and_lengths), + NDBuffer.from_numpy_array(index.offsets_and_lengths), self._get_index_chunk_spec(index.chunks_per_shard), ) assert index_bytes is not None From fccd95664c0c513bc53a865d02668a1573f267d4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
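The classmethod introduced here replaces the module-level `as_nd_buffer()` helper on the write path; a minimal sketch of the new entry point (illustrative only):

    import numpy as np
    from zarr.buffer import NDBuffer

    value = np.arange(6, dtype="uint32").reshape(2, 3)
    nd = NDBuffer.from_numpy_array(value)  # np.asanyarray: no copy for an existing ndarray
    assert nd.shape == (2, 3)
    assert nd.dtype == np.dtype("uint32")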
Kristensen" Date: Mon, 13 May 2024 14:12:23 +0200 Subject: [PATCH 15/45] remove as_buffer() --- src/zarr/array.py | 8 ++++---- src/zarr/array_v2.py | 12 +++++++----- src/zarr/buffer.py | 34 +++++++++++++++++++--------------- src/zarr/codecs/blosc.py | 10 +++++++--- src/zarr/codecs/bytes.py | 4 ++-- src/zarr/codecs/crc32c_.py | 6 +++--- src/zarr/codecs/sharding.py | 6 +++--- src/zarr/group.py | 10 +++++----- src/zarr/metadata.py | 4 ++-- src/zarr/store/local.py | 10 +++++----- src/zarr/store/memory.py | 4 ++-- src/zarr/testing/store.py | 20 +++++++++++--------- tests/v3/test_group.py | 4 ++-- 13 files changed, 72 insertions(+), 60 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 8b6f3312d6..8f5be1b223 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -20,7 +20,7 @@ # from zarr.array_v2 import ArrayV2 -from zarr.buffer import NDBuffer, as_buffer +from zarr.buffer import Buffer, NDBuffer from zarr.codecs import BytesCodec from zarr.common import ( ZARR_JSON, @@ -216,7 +216,7 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: return out.as_numpy_array()[()] async def _save_metadata(self) -> None: - await (self.store_path / ZARR_JSON).set(as_buffer(self.metadata.to_bytes())) + await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(self.metadata.to_bytes())) async def _read_chunk( self, @@ -382,14 +382,14 @@ async def _delete_key(key: str) -> None: ) # Write new metadata - await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata.to_bytes())) + await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(new_metadata.to_bytes())) return replace(self, metadata=new_metadata) async def update_attributes(self, new_attributes: Dict[str, Any]) -> AsyncArray: new_metadata = replace(self.metadata, attributes=new_attributes) # Write new metadata - await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata.to_bytes())) + await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(new_metadata.to_bytes())) return replace(self, metadata=new_metadata) def __repr__(self): diff --git a/src/zarr/array_v2.py b/src/zarr/array_v2.py index 4c6576d3ed..2254b0e7c3 100644 --- a/src/zarr/array_v2.py +++ b/src/zarr/array_v2.py @@ -10,7 +10,7 @@ from numcodecs.compat import ensure_bytes, ensure_ndarray -from zarr.buffer import NDBuffer, as_buffer, as_bytearray +from zarr.buffer import Buffer, NDBuffer, as_bytearray from zarr.common import ( ZARRAY_JSON, ZATTRS_JSON, @@ -180,7 +180,7 @@ async def _save_metadata(self) -> None: await (self.store_path / ZARRAY_JSON).set(self.metadata.to_bytes()) if self.attributes is not None and len(self.attributes) > 0: await (self.store_path / ZATTRS_JSON).set( - as_buffer(json.dumps(self.attributes).encode()), + Buffer.from_bytes(json.dumps(self.attributes).encode()), ) else: await (self.store_path / ZATTRS_JSON).delete() @@ -375,7 +375,7 @@ async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.nda if chunk_bytes is None: await store_path.delete() else: - await store_path.set(as_buffer(chunk_bytes)) + await store_path.set(Buffer.from_bytes(chunk_bytes)) async def _encode_chunk(self, chunk_array: np.ndarray) -> Optional[BytesLike]: chunk_array = chunk_array.ravel(order=self.metadata.order) @@ -494,7 +494,7 @@ async def convert_to_v3_async(self) -> Array: ) new_metadata_bytes = new_metadata.to_bytes() - await (self.store_path / ZARR_JSON).set(as_buffer(new_metadata_bytes)) + await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(new_metadata_bytes)) return Array.from_dict( store_path=self.store_path, @@ -502,7 
+502,9 @@ async def convert_to_v3_async(self) -> Array: ) async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> ArrayV2: - await (self.store_path / ZATTRS_JSON).set(as_buffer(json.dumps(new_attributes).encode())) + await (self.store_path / ZATTRS_JSON).set( + Buffer.from_bytes(json.dumps(new_attributes).encode()) + ) return replace(self, attributes=new_attributes) def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2: diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 727ad79b96..b8d7ba93b5 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, Tuple import numpy as np +from zarr.common import BytesLike + if TYPE_CHECKING: from typing_extensions import Self @@ -15,18 +17,30 @@ class Buffer: def __init__(self, array: np.ndarray): assert isinstance(array, np.ndarray) - assert array.dtype != object + self._data = array + @classmethod + def create_empty(cls, *, nbytes: int) -> Self: + return cls(np.empty(shape=(nbytes,), dtype="b")) + + @classmethod + def from_bytes(cls, data: BytesLike) -> Self: + return cls(np.frombuffer(data, dtype="b")) + + @classmethod + def from_nd_buffer(cls, nd_buffer: NDBuffer) -> Self: + return cls(np.frombuffer(nd_buffer.as_numpy_array().reshape(-1), dtype="b")) + + def to_bytes(self) -> bytes: + return bytes(self.memoryview()) + def memoryview(self) -> memoryview: return memoryview(self._data.reshape(-1).view(dtype="b")) def as_numpy_array(self, dtype: Optional[np.DTypeLike] = "b") -> np.ndarray: return self._data.reshape(-1).view(dtype=dtype) - def to_bytes(self) -> bytes: - return bytes(self.memoryview()) - def __getitem__(self, key: Any) -> Self: return self.__class__(self.as_numpy_array().__getitem__(key)) @@ -127,18 +141,8 @@ def transpose(self, *axes: np.SupportsIndex) -> Self: return self.__class__(self.as_numpy_array().transpose(*axes)) -def as_buffer(data: Any) -> Buffer: - if isinstance(data, Buffer): - return data - if isinstance(data, NDBuffer): - return Buffer(data.as_numpy_array()) - if isinstance(data, (bytes, bytearray, memoryview)): - return Buffer(np.frombuffer(data, dtype="b")) - return Buffer(np.asanyarray(data)) - - def as_bytes_wrapper(func: Callable[[bytes], bytes], buf: Buffer) -> Buffer: - return as_buffer(func(buf.to_bytes())) + return Buffer.from_bytes(func(buf.to_bytes())) def as_bytearray(data: Optional[Buffer]) -> Optional[bytes]: diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 890c6e172e..ff52dba061 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -9,7 +9,7 @@ from numcodecs.blosc import Blosc from zarr.abc.codec import BytesBytesCodec -from zarr.buffer import Buffer, as_buffer, as_bytes_wrapper +from zarr.buffer import Buffer, as_bytes_wrapper from zarr.codecs.registry import register_codec from zarr.common import parse_enum, parse_named_configuration, to_thread @@ -170,8 +170,12 @@ async def encode( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Optional[Buffer]: - chunk_array = chunk_bytes.as_numpy_array(chunk_spec.dtype) - return await to_thread(lambda: as_buffer(self._blosc_codec.encode(chunk_array))) + # Since blosc only takes bytes, we convert the input and output of the encoding + # between bytes and Buffer + return await to_thread( + lambda chunk: Buffer.from_bytes(self._blosc_codec.encode(chunk.memoryview())), + chunk_bytes, + ) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise 
NotImplementedError diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index e6c44fee21..3a18442bab 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -8,7 +8,7 @@ import numpy as np from zarr.abc.codec import ArrayBytesCodec -from zarr.buffer import Buffer, NDBuffer, as_buffer +from zarr.buffer import Buffer, NDBuffer from zarr.codecs.registry import register_codec from zarr.common import parse_enum, parse_named_configuration @@ -93,7 +93,7 @@ async def encode( if self.endian is not None and self.endian != chunk_array.byteorder: new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) chunk_array = chunk_array.astype(new_dtype) - return as_buffer(chunk_array) + return Buffer.from_nd_buffer(chunk_array) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index 7a37a3353a..ac389c1406 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -8,7 +8,7 @@ from crc32c import crc32c from zarr.abc.codec import BytesBytesCodec -from zarr.buffer import Buffer, as_buffer +from zarr.buffer import Buffer from zarr.codecs.registry import register_codec from zarr.common import parse_named_configuration @@ -46,7 +46,7 @@ async def decode( "Stored and computed checksum do not match. " + f"Stored: {stored_checksum!r}. Computed: {computed_checksum!r}." ) - return as_buffer(inner_bytes) + return Buffer.from_bytes(inner_bytes) async def encode( self, @@ -54,7 +54,7 @@ async def encode( _chunk_spec: ArraySpec, ) -> Optional[Buffer]: checksum = crc32c(chunk_bytes.memoryview()) - return as_buffer(chunk_bytes.to_bytes() + np.uint32(checksum).tobytes()) + return Buffer.from_bytes(chunk_bytes.to_bytes() + np.uint32(checksum).tobytes()) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index b16a35e66f..8aaab78703 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -37,7 +37,7 @@ ArrayMetadata, parse_codecs, ) -from zarr.buffer import Buffer, NDBuffer, as_buffer +from zarr.buffer import Buffer, NDBuffer if TYPE_CHECKING: from typing import Awaitable, Callable, Dict, Iterator, List, Optional, Set, Tuple @@ -148,7 +148,7 @@ async def from_bytes( def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() - obj.buf = as_buffer(np.array([], dtype="b")) + obj.buf = Buffer.create_empty(nbytes=0) obj.index = index return obj @@ -190,7 +190,7 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = as_buffer(np.array([], dtype="b")) + obj.buf = Buffer.create_empty(nbytes=0) obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj diff --git a/src/zarr/group.py b/src/zarr/group.py index 2b1636a3f6..449c6d1333 100644 --- a/src/zarr/group.py +++ b/src/zarr/group.py @@ -6,7 +6,7 @@ import json import logging -from zarr.buffer import as_buffer +from zarr.buffer import Buffer if TYPE_CHECKING: from typing import ( @@ -240,7 +240,7 @@ async def delitem(self, key: str) -> None: async def _save_metadata(self) -> None: to_save = self.metadata.to_bytes() awaitables = [ - (self.store_path / key).set(as_buffer(value)) for key, value in to_save.items() + (self.store_path / key).set(Buffer.from_bytes(value)) for key, value in 
to_save.items() ] await asyncio.gather(*awaitables) @@ -273,9 +273,9 @@ async def update_attributes(self, new_attributes: dict[str, Any]): to_save = self.metadata.to_bytes() if self.metadata.zarr_format == 2: # only save the .zattrs object - await (self.store_path / ZATTRS_JSON).set(as_buffer(to_save[ZATTRS_JSON])) + await (self.store_path / ZATTRS_JSON).set(Buffer.from_bytes(to_save[ZATTRS_JSON])) else: - await (self.store_path / ZARR_JSON).set(as_buffer(to_save[ZARR_JSON])) + await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(to_save[ZARR_JSON])) self.metadata.attributes.clear() self.metadata.attributes.update(new_attributes) @@ -444,7 +444,7 @@ async def update_attributes_async(self, new_attributes: dict[str, Any]) -> Group # Write new metadata to_save = new_metadata.to_bytes() awaitables = [ - (self.store_path / key).set(as_buffer(value)) for key, value in to_save.items() + (self.store_path / key).set(Buffer.from_bytes(value)) for key, value in to_save.items() ] await asyncio.gather(*awaitables) diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 7a49e330dc..098ab34b86 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -6,7 +6,7 @@ import numpy as np import numpy.typing as npt -from zarr.buffer import Buffer, as_buffer +from zarr.buffer import Buffer from zarr.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.chunk_key_encodings import ChunkKeyEncoding, parse_separator @@ -299,7 +299,7 @@ def _json_convert(o): return o.descr raise TypeError - return as_buffer(json.dumps(self.to_dict(), default=_json_convert).encode()) + return Buffer.from_bytes(json.dumps(self.to_dict(), default=_json_convert).encode()) @classmethod def from_dict(cls, data: Dict[str, Any]) -> ArrayV2Metadata: diff --git a/src/zarr/store/local.py b/src/zarr/store/local.py index bfd8dd0c3b..37a9a5b8f5 100644 --- a/src/zarr/store/local.py +++ b/src/zarr/store/local.py @@ -7,7 +7,7 @@ from typing import Union, Optional, List, Tuple from zarr.abc.store import Store -from zarr.buffer import Buffer, as_buffer +from zarr.buffer import Buffer from zarr.common import concurrent_map, to_thread @@ -32,7 +32,7 @@ def _get(path: Path, byte_range: Optional[Tuple[int, Optional[int]]] = None) -> end = (start + byte_range[1]) if byte_range[1] is not None else None else: - return as_buffer(path.read_bytes()) + return Buffer.from_bytes(path.read_bytes()) with path.open("rb") as f: size = f.seek(0, io.SEEK_END) if start is not None: @@ -43,8 +43,8 @@ def _get(path: Path, byte_range: Optional[Tuple[int, Optional[int]]] = None) -> if end is not None: if end < 0: end = size + end - return as_buffer(f.read(end - f.tell())) - return as_buffer(f.read()) + return Buffer.from_bytes(f.read(end - f.tell())) + return Buffer.from_bytes(f.read()) def _put( @@ -124,7 +124,7 @@ async def set(self, key: str, value: Buffer) -> None: assert isinstance(key, str) if isinstance(value, (bytes, bytearray)): # TODO: to support the v2 tests, we convert bytes to Buffer here - value = as_buffer(value) + value = Buffer.from_bytes(value) if not isinstance(value, Buffer): raise TypeError("LocalStore.set(): `value` must a Buffer instance") path = self.root / key diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py index 2b86578b32..1caba5acc1 100644 --- a/src/zarr/store/memory.py +++ b/src/zarr/store/memory.py @@ -5,7 +5,7 @@ from zarr.common import concurrent_map from zarr.abc.store import Store -from zarr.buffer import Buffer, as_buffer +from zarr.buffer import Buffer # TODO: this store could easily be extended to wrap 
any MutableMapping store from v2 @@ -53,7 +53,7 @@ async def set( assert isinstance(key, str) if isinstance(value, (bytes, bytearray)): # TODO: to support the v2 tests, we convert bytes to Buffer here - value = as_buffer(value) + value = Buffer.from_bytes(value) if not isinstance(value, Buffer): raise TypeError(f"Expected Buffer. Got {type(value)}.") diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 48f2ce46b0..99f8021594 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -1,7 +1,7 @@ import pytest from zarr.abc.store import Store -from zarr.buffer import as_buffer +from zarr.buffer import Buffer class StoreTests: @@ -26,14 +26,14 @@ def test_store_capabilities(self, store: Store) -> None: @pytest.mark.parametrize("key", ["c/0", "foo/c/0.0", "foo/0/0"]) @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) async def test_set_get_bytes_roundtrip(self, store: Store, key: str, data: bytes) -> None: - await store.set(key, as_buffer(data)) + await store.set(key, Buffer.from_bytes(data)) assert await store.get(key) == data @pytest.mark.parametrize("key", ["foo/c/0"]) @pytest.mark.parametrize("data", [b"\x01\x02\x03\x04", b""]) async def test_get_partial_values(self, store: Store, key: str, data: bytes) -> None: # put all of the data - await store.set(key, as_buffer(data)) + await store.set(key, Buffer.from_bytes(data)) # read back just part of it vals = await store.get_partial_values([(key, (0, 2))]) assert vals == [data[0:2]] @@ -44,18 +44,18 @@ async def test_get_partial_values(self, store: Store, key: str, data: bytes) -> async def test_exists(self, store: Store) -> None: assert not await store.exists("foo") - await store.set("foo/zarr.json", as_buffer(b"bar")) + await store.set("foo/zarr.json", Buffer.from_bytes(b"bar")) assert await store.exists("foo/zarr.json") async def test_delete(self, store: Store) -> None: - await store.set("foo/zarr.json", as_buffer(b"bar")) + await store.set("foo/zarr.json", Buffer.from_bytes(b"bar")) assert await store.exists("foo/zarr.json") await store.delete("foo/zarr.json") assert not await store.exists("foo/zarr.json") async def test_list(self, store: Store) -> None: assert [k async for k in store.list()] == [] - await store.set("foo/zarr.json", as_buffer(b"bar")) + await store.set("foo/zarr.json", Buffer.from_bytes(b"bar")) keys = [k async for k in store.list()] assert keys == ["foo/zarr.json"], keys @@ -63,7 +63,9 @@ async def test_list(self, store: Store) -> None: for i in range(10): key = f"foo/c/{i}" expected.append(key) - await store.set(f"foo/c/{i}", as_buffer(i.to_bytes(length=3, byteorder="little"))) + await store.set( + f"foo/c/{i}", Buffer.from_bytes(i.to_bytes(length=3, byteorder="little")) + ) async def test_list_prefix(self, store: Store) -> None: # TODO: we currently don't use list_prefix anywhere @@ -72,8 +74,8 @@ async def test_list_prefix(self, store: Store) -> None: async def test_list_dir(self, store: Store) -> None: assert [k async for k in store.list_dir("")] == [] assert [k async for k in store.list_dir("foo")] == [] - await store.set("foo/zarr.json", as_buffer(b"bar")) - await store.set("foo/c/1", as_buffer(b"\x01")) + await store.set("foo/zarr.json", Buffer.from_bytes(b"bar")) + await store.set("foo/c/1", Buffer.from_bytes(b"\x01")) keys = [k async for k in store.list_dir("foo")] assert set(keys) == set(["zarr.json", "c"]), keys diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index fd1f4e5b27..c94ec87e2f 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ 
-1,7 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from zarr.buffer import as_buffer +from zarr.buffer import Buffer from zarr.sync import sync if TYPE_CHECKING: @@ -41,7 +41,7 @@ def test_group_members(store_type, request): # add an extra object to the domain of the group. # the list of children should ignore this object. - sync(store.set(f"{path}/extra_object-1", as_buffer(b"000000"))) + sync(store.set(f"{path}/extra_object-1", Buffer.from_bytes(b"000000"))) # add an extra object under a directory-like prefix in the domain of the group. # this creates a directory with a random key in it # this should not show up as a member From 962d729f1dfa361ff04e64eacc7a695525897b9f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 14:34:12 +0200 Subject: [PATCH 16/45] remove Buffer.as_numpy_array() --- src/zarr/buffer.py | 24 +++++++++++++----------- src/zarr/codecs/bytes.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index b8d7ba93b5..17413b16f0 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -17,7 +17,9 @@ class Buffer: def __init__(self, array: np.ndarray): assert isinstance(array, np.ndarray) - + assert array.ndim == 1 + assert array.itemsize == 1 + assert array.dtype == np.dtype("b") self._data = array @classmethod @@ -32,20 +34,20 @@ def from_bytes(cls, data: BytesLike) -> Self: def from_nd_buffer(cls, nd_buffer: NDBuffer) -> Self: return cls(np.frombuffer(nd_buffer.as_numpy_array().reshape(-1), dtype="b")) + def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: + return NDBuffer(self._data.view(dtype=dtype)) + def to_bytes(self) -> bytes: return bytes(self.memoryview()) def memoryview(self) -> memoryview: - return memoryview(self._data.reshape(-1).view(dtype="b")) - - def as_numpy_array(self, dtype: Optional[np.DTypeLike] = "b") -> np.ndarray: - return self._data.reshape(-1).view(dtype=dtype) + return memoryview(self._data) def __getitem__(self, key: Any) -> Self: - return self.__class__(self.as_numpy_array().__getitem__(key)) + return self.__class__(self._data.__getitem__(key)) def __setitem__(self, key: Any, value: Any) -> None: - self.as_numpy_array().__setitem__(key, value) + self._data.__setitem__(key, value) def __len__(self) -> int: return self._data.nbytes @@ -74,10 +76,6 @@ def __init__(self, array: np.ndarray): assert array.dtype != object self._data = array - @classmethod - def from_numpy_array(cls, array: np.ArrayLike) -> Self: - return cls(np.asanyarray(array)) - @classmethod def create_empty( cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C" @@ -90,6 +88,10 @@ def create_zeros( ) -> Self: return cls(np.zeros(shape=shape, dtype=dtype, order=order)) + @classmethod + def from_numpy_array(cls, array: np.ArrayLike) -> Self: + return cls(np.asanyarray(array)) + def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: if dtype is None: return self._data diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 3a18442bab..4d3ee5469a 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -74,7 +74,7 @@ async def decode( dtype = np.dtype(f"{prefix}{chunk_spec.dtype.str[1:]}") else: dtype = np.dtype(f"|{chunk_spec.dtype.str[1:]}") - chunk_array = NDBuffer(chunk_bytes.as_numpy_array(dtype)) + chunk_array = chunk_bytes.as_nd_buffer(dtype=dtype) # ensure correct chunk shape if chunk_array.shape != chunk_spec.shape: From 12de6c2a99ca16f4b6b67000f8986f214f60d4fa Mon Sep 
17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 14:42:25 +0200 Subject: [PATCH 17/45] impl. NDBuffer.as_buffer() --- src/zarr/buffer.py | 7 +++---- src/zarr/codecs/bytes.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 17413b16f0..ccd0be3525 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -30,10 +30,6 @@ def create_empty(cls, *, nbytes: int) -> Self: def from_bytes(cls, data: BytesLike) -> Self: return cls(np.frombuffer(data, dtype="b")) - @classmethod - def from_nd_buffer(cls, nd_buffer: NDBuffer) -> Self: - return cls(np.frombuffer(nd_buffer.as_numpy_array().reshape(-1), dtype="b")) - def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: return NDBuffer(self._data.view(dtype=dtype)) @@ -92,6 +88,9 @@ def create_zeros( def from_numpy_array(cls, array: np.ArrayLike) -> Self: return cls(np.asanyarray(array)) + def as_buffer(self) -> Buffer: + return Buffer(np.frombuffer(self.as_numpy_array().reshape(-1), dtype="b")) + def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: if dtype is None: return self._data diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 4d3ee5469a..d6a626e160 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -93,7 +93,7 @@ async def encode( if self.endian is not None and self.endian != chunk_array.byteorder: new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) chunk_array = chunk_array.astype(new_dtype) - return Buffer.from_nd_buffer(chunk_array) + return chunk_array.as_buffer() def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length From 36a0d98be9588ea5b1de636876ecfed7fdc0e046 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 13 May 2024 16:12:14 +0200 Subject: [PATCH 18/45] reduce the use of as_numpy_array() --- src/zarr/buffer.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index ccd0be3525..a592ae69e6 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -89,7 +89,7 @@ def from_numpy_array(cls, array: np.ArrayLike) -> Self: return cls(np.asanyarray(array)) def as_buffer(self) -> Buffer: - return Buffer(np.frombuffer(self.as_numpy_array().reshape(-1), dtype="b")) + return Buffer(self._data.reshape(-1).view(dtype="b")) def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: if dtype is None: @@ -98,11 +98,11 @@ def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: @property def dtype(self) -> np.dtype[Any]: - return self.as_numpy_array().dtype + return self._data.dtype @property def shape(self) -> Tuple[int, ...]: - return self.as_numpy_array().shape + return self._data.shape @property def byteorder(self) -> Endian: @@ -116,30 +116,30 @@ def byteorder(self) -> Endian: return Endian(sys.byteorder) def reshape(self, newshape: Iterable[int]) -> Self: - return self.__class__(self.as_numpy_array().reshape(newshape)) + return self.__class__(self._data.reshape(newshape)) def astype(self, dtype: np.DTypeLike, order: Literal["K", "A", "C", "F"] = "K") -> Self: - return self.__class__(self.as_numpy_array().astype(dtype=dtype, order=order)) + return self.__class__(self._data.astype(dtype=dtype, order=order)) def __getitem__(self, key: Any) -> Self: - return self.__class__(np.asanyarray(self.as_numpy_array().__getitem__(key))) + return self.__class__(np.asanyarray(self._data.__getitem__(key))) def __setitem__(self, key: Any, value: Any) -> None: if isinstance(value, NDBuffer): - value = value.as_numpy_array() - self.as_numpy_array().__setitem__(key, value) + value = value._data + self._data.__setitem__(key, value) def __len__(self) -> int: - return self.as_numpy_array().__len__() + return self._data.__len__() def fill(self, value: Any) -> None: - self.as_numpy_array().fill(value) + self._data.fill(value) def copy(self) -> Self: - return self.__class__(self.as_numpy_array().copy()) + return self.__class__(self._data.copy()) def transpose(self, *axes: np.SupportsIndex) -> Self: - return self.__class__(self.as_numpy_array().transpose(*axes)) + return self.__class__(self._data.transpose(*axes)) def as_bytes_wrapper(func: Callable[[bytes], bytes], buf: Buffer) -> Buffer: From 43ebafeda7b22439ed3b26434954ecf7ad27ee8a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 16:25:41 +0200 Subject: [PATCH 19/45] impl. 
and use NDBuffer.all_equal --- src/zarr/array.py | 2 +- src/zarr/buffer.py | 3 +++ src/zarr/codecs/sharding.py | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 8f5be1b223..fab81d0dac 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -343,7 +343,7 @@ async def _write_chunk( async def _write_chunk_to_store( self, store_path: StorePath, chunk_array: NDBuffer, chunk_spec: ArraySpec ) -> None: - if np.all(chunk_array.as_numpy_array() == self.metadata.fill_value): + if chunk_array.all_equal(self.metadata.fill_value): # chunks that only contain fill_value will be removed await store_path.delete() else: diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index a592ae69e6..d758e979af 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -132,6 +132,9 @@ def __setitem__(self, key: Any, value: Any) -> None: def __len__(self) -> int: return self._data.__len__() + def all_equal(self, other: Any) -> bool: + return bool((self._data == other).all()) + def fill(self, value: Any) -> None: self._data.fill(value) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 8aaab78703..6c74f944bb 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -459,7 +459,7 @@ async def _write_chunk( ) chunk_array.fill(shard_spec.fill_value) chunk_array[chunk_selection] = shard_array[out_selection] - if not np.array_equiv(chunk_array.as_numpy_array(), shard_spec.fill_value): + if not chunk_array.all_equal(shard_spec.fill_value): chunk_spec = self._get_chunk_spec(shard_spec) return ( chunk_coords, @@ -538,7 +538,7 @@ async def _write_chunk( ).copy() # make a writable copy chunk_array[chunk_selection] = shard_array[out_selection] - if not np.array_equiv(chunk_array.as_numpy_array(), shard_spec.fill_value): + if not chunk_array.all_equal(shard_spec.fill_value): return ( chunk_coords, await self.codecs.encode(chunk_array, chunk_spec), From d01557e45f15c2b7ffff1ccd356aed2abcf5bf29 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 16:29:42 +0200 Subject: [PATCH 20/45] as_numpy_array(): doc --- src/zarr/buffer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index d758e979af..310fe17763 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -91,10 +91,18 @@ def from_numpy_array(cls, array: np.ArrayLike) -> Self: def as_buffer(self) -> Buffer: return Buffer(self._data.reshape(-1).view(dtype="b")) - def as_numpy_array(self, dtype: Optional[np.DTypeLike] = None) -> np.ndarray: - if dtype is None: - return self._data - return self._data.astype(dtype=dtype, copy=False) + def as_numpy_array(self) -> np.ndarray: + """Return the buffer as a NumPy array. + + Warning + ------- + Might have to copy data, only use this method for small buffers such as metadata + + Return + ------ + NumPy array of this buffer (might be a data copy) + """ + return self._data @property def dtype(self) -> np.dtype[Any]: From c74f2661226da860c96392268b09aecae2497c1b Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 13 May 2024 16:40:14 +0200 Subject: [PATCH 21/45] remove as_bytearray() --- src/zarr/array_v2.py | 9 ++++++++- src/zarr/buffer.py | 8 +------- tests/v3/test_codecs.py | 15 ++++++--------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/zarr/array_v2.py b/src/zarr/array_v2.py index 2254b0e7c3..cc4ca7dd8a 100644 --- a/src/zarr/array_v2.py +++ b/src/zarr/array_v2.py @@ -10,7 +10,7 @@ from numcodecs.compat import ensure_bytes, ensure_ndarray -from zarr.buffer import Buffer, NDBuffer, as_bytearray +from zarr.buffer import Buffer, NDBuffer from zarr.common import ( ZARRAY_JSON, ZATTRS_JSON, @@ -30,6 +30,13 @@ from zarr.array import Array +def as_bytearray(data: Optional[Buffer]) -> Optional[bytes]: + """Help function to convert a Buffer into bytes if not None""" + if data is None: + return data + return data.to_bytes() + + @dataclass(frozen=True) class _AsyncArrayProxy: array: ArrayV2 diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 310fe17763..5b50b54084 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, Tuple +from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Tuple import numpy as np from zarr.common import BytesLike @@ -155,9 +155,3 @@ def transpose(self, *axes: np.SupportsIndex) -> Self: def as_bytes_wrapper(func: Callable[[bytes], bytes], buf: Buffer) -> Buffer: return Buffer.from_bytes(func(buf.to_bytes())) - - -def as_bytearray(data: Optional[Buffer]) -> Optional[bytes]: - if data is None: - return data - return data.to_bytes() diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index a653c9ce92..85b21534fb 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from zarr.buffer import as_bytearray import zarr.v2 from zarr.abc.codec import Codec from zarr.array import Array, AsyncArray @@ -738,9 +737,9 @@ async def test_dimension_names(store: Store): ) assert (await AsyncArray.open(store / "dimension_names2")).metadata.dimension_names is None - zarr_json_bytes = as_bytearray(await (store / "dimension_names2" / "zarr.json").get()) - assert zarr_json_bytes is not None - assert "dimension_names" not in json.loads(zarr_json_bytes) + zarr_json_buffer = await (store / "dimension_names2" / "zarr.json").get() + assert zarr_json_buffer is not None + assert "dimension_names" not in json.loads(zarr_json_buffer.to_bytes()) def test_gzip(store: Store): @@ -966,7 +965,7 @@ async def test_blosc_evolve(store: Store): codecs=[BytesCodec(), BloscCodec()], ) - zarr_json = json.loads(as_bytearray(await (store / "blosc_evolve_u1" / "zarr.json").get())) + zarr_json = json.loads((await (store / "blosc_evolve_u1" / "zarr.json").get()).to_bytes()) blosc_configuration_json = zarr_json["codecs"][1]["configuration"] assert blosc_configuration_json["typesize"] == 1 assert blosc_configuration_json["shuffle"] == "bitshuffle" @@ -980,7 +979,7 @@ async def test_blosc_evolve(store: Store): codecs=[BytesCodec(), BloscCodec()], ) - zarr_json = json.loads(as_bytearray(await (store / "blosc_evolve_u2" / "zarr.json").get())) + zarr_json = json.loads((await (store / "blosc_evolve_u2" / "zarr.json").get()).to_bytes()) blosc_configuration_json = zarr_json["codecs"][1]["configuration"] assert blosc_configuration_json["typesize"] == 2 assert blosc_configuration_json["shuffle"] == "shuffle" @@ -994,9 +993,7 @@ async def test_blosc_evolve(store: 
Store): codecs=[ShardingCodec(chunk_shape=(16, 16), codecs=[BytesCodec(), BloscCodec()])], ) - zarr_json = json.loads( - as_bytearray(await (store / "sharding_blosc_evolve" / "zarr.json").get()) - ) + zarr_json = json.loads((await (store / "sharding_blosc_evolve" / "zarr.json").get()).to_bytes()) blosc_configuration_json = zarr_json["codecs"][0]["configuration"]["codecs"][1]["configuration"] assert blosc_configuration_json["typesize"] == 2 assert blosc_configuration_json["shuffle"] == "shuffle" From 6fce5a9e44f80cab14ee0a7c3363f6c67a61f7c9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 17:25:11 +0200 Subject: [PATCH 22/45] impl. Buffer.from_numpy_array() --- src/zarr/buffer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 5b50b54084..5f8986f362 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -26,9 +26,13 @@ def __init__(self, array: np.ndarray): def create_empty(cls, *, nbytes: int) -> Self: return cls(np.empty(shape=(nbytes,), dtype="b")) + @classmethod + def from_numpy_array(cls, array: np.ArrayLike) -> Self: + return cls(np.asanyarray(array).reshape(-1).view(dtype="b")) + @classmethod def from_bytes(cls, data: BytesLike) -> Self: - return cls(np.frombuffer(data, dtype="b")) + return cls.from_numpy_array(np.frombuffer(data, dtype="b")) def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: return NDBuffer(self._data.view(dtype=dtype)) From c37312ba11bcf36fde030b6d01b2d31385549b08 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 19:32:11 +0200 Subject: [PATCH 23/45] NDArrayLike --- src/zarr/buffer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 5f8986f362..c27ce12ffb 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Tuple +from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Tuple, TypeAlias import numpy as np from zarr.common import BytesLike @@ -11,12 +11,13 @@ from typing_extensions import Self from zarr.codecs.bytes import Endian +NDArrayLike: TypeAlias = np.ndarray + class Buffer: """A flat contiguous version of `NDBuffer` with an item size of 1""" - def __init__(self, array: np.ndarray): - assert isinstance(array, np.ndarray) + def __init__(self, array: NDArrayLike): assert array.ndim == 1 assert array.itemsize == 1 assert array.dtype == np.dtype("b") @@ -71,8 +72,8 @@ class NDBuffer: of memory such as CUDA device memory. """ - def __init__(self, array: np.ndarray): - assert isinstance(array, np.ndarray) + def __init__(self, array: NDArrayLike): + assert array.ndim > 0 assert array.dtype != object self._data = array From 925fa59d1915df90daad13028121f045a15e2a7b Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Mon, 13 May 2024 19:50:38 +0200 Subject: [PATCH 24/45] Factory.Create --- src/zarr/array.py | 9 ++++++--- src/zarr/array_v2.py | 9 ++++++--- src/zarr/buffer.py | 30 ++++++++++++++++++------------ src/zarr/codecs/sharding.py | 16 ++++++++++------ 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index fab81d0dac..6edc636b52 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -193,7 +193,8 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: ) # setup output array - out = NDBuffer.create_zeros( + out = NDBuffer.create( + factory=np.zeros, shape=indexer.shape, dtype=self.metadata.dtype, order=self.order, @@ -303,7 +304,8 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( + factory=np.empty, shape=chunk_shape, dtype=self.metadata.dtype, ) @@ -327,7 +329,8 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( + factory=np.empty, shape=chunk_shape, dtype=self.metadata.dtype, ) diff --git a/src/zarr/array_v2.py b/src/zarr/array_v2.py index cc4ca7dd8a..ad58d1ceef 100644 --- a/src/zarr/array_v2.py +++ b/src/zarr/array_v2.py @@ -224,7 +224,8 @@ async def get_async(self, selection: Selection): ) # setup output array - out = NDBuffer.create_zeros( + out = NDBuffer.create( + factory=np.zeros, shape=indexer.shape, dtype=self.metadata.dtype, order=self.metadata.order, @@ -341,7 +342,8 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( + factory=np.empty, shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, @@ -358,7 +360,8 @@ async def _write_chunk( # merge new value if tmp is None: - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( + factory=np.empty, shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index c27ce12ffb..f96b6dff5b 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Tuple, TypeAlias +from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Protocol, Tuple, TypeAlias import numpy as np from zarr.common import BytesLike @@ -14,6 +14,13 @@ NDArrayLike: TypeAlias = np.ndarray +class Factory: + class Create(Protocol): + def __call__( + self, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] + ) -> NDArrayLike: ... 
+ + class Buffer: """A flat contiguous version of `NDBuffer` with an item size of 1""" @@ -24,8 +31,8 @@ def __init__(self, array: NDArrayLike): self._data = array @classmethod - def create_empty(cls, *, nbytes: int) -> Self: - return cls(np.empty(shape=(nbytes,), dtype="b")) + def create(cls, *, factory: Factory.Create, nbytes: int) -> Self: + return cls(factory(shape=(nbytes,), dtype="b", order="C")) @classmethod def from_numpy_array(cls, array: np.ArrayLike) -> Self: @@ -78,16 +85,15 @@ def __init__(self, array: NDArrayLike): self._data = array @classmethod - def create_empty( - cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C" - ) -> Self: - return cls(np.empty(shape=shape, dtype=dtype, order=order)) - - @classmethod - def create_zeros( - cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C" + def create( + cls, + *, + factory: Factory.Create, + shape: Iterable[int], + dtype: np.DTypeLike, + order: Literal["C", "F"] = "C", ) -> Self: - return cls(np.zeros(shape=shape, dtype=dtype, order=order)) + return cls(factory(shape=shape, dtype=dtype, order=order)) @classmethod def from_numpy_array(cls, array: np.ArrayLike) -> Self: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6c74f944bb..4c5c294337 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -148,7 +148,7 @@ async def from_bytes( def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() - obj.buf = Buffer.create_empty(nbytes=0) + obj.buf = Buffer.create(factory=np.empty, nbytes=0) obj.index = index return obj @@ -190,7 +190,7 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = Buffer.create_empty(nbytes=0) + obj.buf = Buffer.create(factory=np.empty, nbytes=0) obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj @@ -312,7 +312,8 @@ async def decode( ) # setup output array - out = NDBuffer.create_zeros( + out = NDBuffer.create( + factory=np.zeros, shape=shard_shape, dtype=shard_spec.dtype, order=shard_spec.order, @@ -359,7 +360,8 @@ async def decode_partial( ) # setup output array - out = NDBuffer.create_zeros( + out = NDBuffer.create( + factory=np.zeros, shape=indexer.shape, dtype=shard_spec.dtype, order=shard_spec.order, @@ -453,7 +455,8 @@ async def _write_chunk( chunk_array = shard_array[out_selection] else: # handling writing partial chunks - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( + factory=np.empty, shape=chunk_shape, dtype=shard_spec.dtype, ) @@ -527,7 +530,8 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( + factory=np.empty, shape=self.chunk_shape, dtype=shard_spec.dtype, ) From 1bbeefc0b244b73f1c4cc4c9ffac3a088c4a4149 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 19:57:25 +0200 Subject: [PATCH 25/45] Factory.FromNumpy --- src/zarr/array.py | 2 +- src/zarr/buffer.py | 13 ++++++++----- src/zarr/codecs/sharding.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 6edc636b52..51d40723f6 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -270,7 +270,7 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None: # We accept a numpy array as input from the user and convert it to a NDBuffer. 
# From this point onwards, we only pass Buffer and NDBuffer between components. - value = NDBuffer.from_numpy_array(value) + value = NDBuffer.from_numpy_array(factory=np.asanyarray, array_like=value) # merging with existing data and encoding chunks await concurrent_map( diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index f96b6dff5b..6aea159918 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -20,6 +20,9 @@ def __call__( self, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] ) -> NDArrayLike: ... + class FromNumpy(Protocol): + def __call__(self, array_like: np.ArrayLike) -> NDArrayLike: ... + class Buffer: """A flat contiguous version of `NDBuffer` with an item size of 1""" @@ -35,12 +38,12 @@ def create(cls, *, factory: Factory.Create, nbytes: int) -> Self: return cls(factory(shape=(nbytes,), dtype="b", order="C")) @classmethod - def from_numpy_array(cls, array: np.ArrayLike) -> Self: - return cls(np.asanyarray(array).reshape(-1).view(dtype="b")) + def from_numpy_array(cls, *, factory: Factory.FromNumpy, array_like: np.ArrayLike) -> Self: + return cls(factory(array_like).reshape(-1).view(dtype="b")) @classmethod def from_bytes(cls, data: BytesLike) -> Self: - return cls.from_numpy_array(np.frombuffer(data, dtype="b")) + return cls.from_numpy_array(factory=np.asarray, array_like=np.frombuffer(data, dtype="b")) def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: return NDBuffer(self._data.view(dtype=dtype)) @@ -96,8 +99,8 @@ def create( return cls(factory(shape=shape, dtype=dtype, order=order)) @classmethod - def from_numpy_array(cls, array: np.ArrayLike) -> Self: - return cls(np.asanyarray(array)) + def from_numpy_array(cls, *, factory: Factory.FromNumpy, array_like: np.ArrayLike) -> Self: + return cls(factory(array_like)) def as_buffer(self) -> Buffer: return Buffer(self._data.reshape(-1).view(dtype="b")) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 4c5c294337..1239bbe6ce 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -607,7 +607,7 @@ async def _decode_shard_index( async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = await self.index_codecs.encode( - NDBuffer.from_numpy_array(index.offsets_and_lengths), + NDBuffer.from_numpy_array(factory=np.asarray, array_like=index.offsets_and_lengths), self._get_index_chunk_spec(index.chunks_per_shard), ) assert index_bytes is not None From 11595675d4bceee38b5fca8cecd95916f066202d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 20:01:41 +0200 Subject: [PATCH 26/45] doc --- src/zarr/buffer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 6aea159918..2325478fce 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -11,6 +11,7 @@ from typing_extensions import Self from zarr.codecs.bytes import Endian +# TODO: create a protocol for the attributes we need NDArrayLike: TypeAlias = np.ndarray @@ -25,7 +26,12 @@ def __call__(self, array_like: np.ArrayLike) -> NDArrayLike: ... class Buffer: - """A flat contiguous version of `NDBuffer` with an item size of 1""" + """A flat contiguous memory block + + We use `Buffer` throughout Zarr to represent a contiguous block of memory. + For now, we only support host memory but the plan is to support other types + of memory such as CUDA device memory. 
+ """ def __init__(self, array: NDArrayLike): assert array.ndim == 1 From 26d67083f756140a0dff314fa7804f027d9f75f3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 22:17:15 +0200 Subject: [PATCH 27/45] doc --- src/zarr/buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 2325478fce..11e1673464 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -11,7 +11,7 @@ from typing_extensions import Self from zarr.codecs.bytes import Endian -# TODO: create a protocol for the attributes we need +# TODO: create a protocol for the attributes we need, for now we just aliasing numpy NDArrayLike: TypeAlias = np.ndarray From 5ce21a0533c25b1d02bbc6f9cc343c5575a23eb6 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 May 2024 22:42:15 +0200 Subject: [PATCH 28/45] remove the buffer factories again --- src/zarr/array.py | 11 ++++------ src/zarr/array_v2.py | 9 +++----- src/zarr/buffer.py | 43 +++++++++++++++++++------------------ src/zarr/codecs/sharding.py | 18 ++++++---------- 4 files changed, 36 insertions(+), 45 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 51d40723f6..fab81d0dac 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -193,8 +193,7 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: ) # setup output array - out = NDBuffer.create( - factory=np.zeros, + out = NDBuffer.create_zeros( shape=indexer.shape, dtype=self.metadata.dtype, order=self.order, @@ -270,7 +269,7 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None: # We accept a numpy array as input from the user and convert it to a NDBuffer. # From this point onwards, we only pass Buffer and NDBuffer between components. 
- value = NDBuffer.from_numpy_array(factory=np.asanyarray, array_like=value) + value = NDBuffer.from_numpy_array(value) # merging with existing data and encoding chunks await concurrent_map( @@ -304,8 +303,7 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = NDBuffer.create( - factory=np.empty, + chunk_array = NDBuffer.create_empty( shape=chunk_shape, dtype=self.metadata.dtype, ) @@ -329,8 +327,7 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = NDBuffer.create( - factory=np.empty, + chunk_array = NDBuffer.create_empty( shape=chunk_shape, dtype=self.metadata.dtype, ) diff --git a/src/zarr/array_v2.py b/src/zarr/array_v2.py index ad58d1ceef..cc4ca7dd8a 100644 --- a/src/zarr/array_v2.py +++ b/src/zarr/array_v2.py @@ -224,8 +224,7 @@ async def get_async(self, selection: Selection): ) # setup output array - out = NDBuffer.create( - factory=np.zeros, + out = NDBuffer.create_zeros( shape=indexer.shape, dtype=self.metadata.dtype, order=self.metadata.order, @@ -342,8 +341,7 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = NDBuffer.create( - factory=np.empty, + chunk_array = NDBuffer.create_empty( shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, @@ -360,8 +358,7 @@ async def _write_chunk( # merge new value if tmp is None: - chunk_array = NDBuffer.create( - factory=np.empty, + chunk_array = NDBuffer.create_empty( shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 11e1673464..ec0f4c52d6 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Protocol, Tuple, TypeAlias +from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Tuple, TypeAlias import numpy as np from zarr.common import BytesLike @@ -15,16 +15,6 @@ NDArrayLike: TypeAlias = np.ndarray -class Factory: - class Create(Protocol): - def __call__( - self, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] - ) -> NDArrayLike: ... - - class FromNumpy(Protocol): - def __call__(self, array_like: np.ArrayLike) -> NDArrayLike: ... 
- - class Buffer: """A flat contiguous memory block @@ -40,16 +30,16 @@ def __init__(self, array: NDArrayLike): self._data = array @classmethod - def create(cls, *, factory: Factory.Create, nbytes: int) -> Self: - return cls(factory(shape=(nbytes,), dtype="b", order="C")) + def create_empty(cls, *, nbytes: int) -> Self: + return cls(np.empty(shape=(nbytes,), dtype="b", order="C")) @classmethod - def from_numpy_array(cls, *, factory: Factory.FromNumpy, array_like: np.ArrayLike) -> Self: - return cls(factory(array_like).reshape(-1).view(dtype="b")) + def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: + return cls(np.asarray(array_like).reshape(-1).view(dtype="b")) @classmethod def from_bytes(cls, data: BytesLike) -> Self: - return cls.from_numpy_array(factory=np.asarray, array_like=np.frombuffer(data, dtype="b")) + return cls.from_numpy_array(np.frombuffer(data, dtype="b")) def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: return NDBuffer(self._data.view(dtype=dtype)) @@ -94,19 +84,30 @@ def __init__(self, array: NDArrayLike): self._data = array @classmethod - def create( + def create_empty( + cls, + *, + shape: Iterable[int], + dtype: np.DTypeLike, + order: Literal["C", "F"] = "C", + ) -> Self: + return cls(np.empty(shape=shape, dtype=dtype, order=order)) + + @classmethod + def create_zeros( cls, *, - factory: Factory.Create, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C", ) -> Self: - return cls(factory(shape=shape, dtype=dtype, order=order)) + ret = cls.create_empty(shape=shape, dtype=dtype, order=order) + ret[...] = 0 + return ret @classmethod - def from_numpy_array(cls, *, factory: Factory.FromNumpy, array_like: np.ArrayLike) -> Self: - return cls(factory(array_like)) + def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: + return cls(np.asanyarray(array_like)) def as_buffer(self) -> Buffer: return Buffer(self._data.reshape(-1).view(dtype="b")) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 1239bbe6ce..6c74f944bb 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -148,7 +148,7 @@ async def from_bytes( def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() - obj.buf = Buffer.create(factory=np.empty, nbytes=0) + obj.buf = Buffer.create_empty(nbytes=0) obj.index = index return obj @@ -190,7 +190,7 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = Buffer.create(factory=np.empty, nbytes=0) + obj.buf = Buffer.create_empty(nbytes=0) obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj @@ -312,8 +312,7 @@ async def decode( ) # setup output array - out = NDBuffer.create( - factory=np.zeros, + out = NDBuffer.create_zeros( shape=shard_shape, dtype=shard_spec.dtype, order=shard_spec.order, @@ -360,8 +359,7 @@ async def decode_partial( ) # setup output array - out = NDBuffer.create( - factory=np.zeros, + out = NDBuffer.create_zeros( shape=indexer.shape, dtype=shard_spec.dtype, order=shard_spec.order, @@ -455,8 +453,7 @@ async def _write_chunk( chunk_array = shard_array[out_selection] else: # handling writing partial chunks - chunk_array = NDBuffer.create( - factory=np.empty, + chunk_array = NDBuffer.create_empty( shape=chunk_shape, dtype=shard_spec.dtype, ) @@ -530,8 +527,7 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = NDBuffer.create( - factory=np.empty, + chunk_array = 
NDBuffer.create_empty( shape=self.chunk_shape, dtype=shard_spec.dtype, ) @@ -607,7 +603,7 @@ async def _decode_shard_index( async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = await self.index_codecs.encode( - NDBuffer.from_numpy_array(factory=np.asarray, array_like=index.offsets_and_lengths), + NDBuffer.from_numpy_array(index.offsets_and_lengths), self._get_index_chunk_spec(index.chunks_per_shard), ) assert index_bytes is not None From be9dce38944e5a4ebf19f182afdfccda20472de5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 09:37:21 +0200 Subject: [PATCH 29/45] NDBuffer.create(): take fill_value --- src/zarr/array.py | 16 ++++++---------- src/zarr/array_v2.py | 14 ++++++-------- src/zarr/buffer.py | 20 ++++++-------------- src/zarr/codecs/sharding.py | 16 ++++++---------- 4 files changed, 24 insertions(+), 42 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index fab81d0dac..9137f74644 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -193,10 +193,8 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: ) # setup output array - out = NDBuffer.create_zeros( - shape=indexer.shape, - dtype=self.metadata.dtype, - order=self.order, + out = NDBuffer.create( + shape=indexer.shape, dtype=self.metadata.dtype, order=self.order, fill_value=0 ) # reading chunks and decoding them @@ -303,11 +301,9 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = NDBuffer.create_empty( - shape=chunk_shape, - dtype=self.metadata.dtype, + chunk_array = NDBuffer.create( + shape=chunk_shape, dtype=self.metadata.dtype, fill_value=value ) - chunk_array.fill(value) else: chunk_array = value[out_selection] await self._write_chunk_to_store(store_path, chunk_array, chunk_spec) @@ -327,11 +323,11 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( shape=chunk_shape, dtype=self.metadata.dtype, + fill_value=self.metadata.fill_value, ) - chunk_array.fill(self.metadata.fill_value) else: chunk_array = ( await self.codecs.decode(chunk_bytes, chunk_spec) diff --git a/src/zarr/array_v2.py b/src/zarr/array_v2.py index cc4ca7dd8a..053d58eb1a 100644 --- a/src/zarr/array_v2.py +++ b/src/zarr/array_v2.py @@ -224,10 +224,8 @@ async def get_async(self, selection: Selection): ) # setup output array - out = NDBuffer.create_zeros( - shape=indexer.shape, - dtype=self.metadata.dtype, - order=self.metadata.order, + out = NDBuffer.create( + shape=indexer.shape, dtype=self.metadata.dtype, order=self.metadata.order, fill_value=0 ) # reading chunks and decoding them @@ -341,12 +339,12 @@ async def _write_chunk( if is_total_slice(chunk_selection, chunk_shape): # write entire chunks if np.isscalar(value): - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, + fill_value=value, ) - chunk_array.fill(value) else: chunk_array = value[out_selection] await self._write_chunk_to_store(store_path, chunk_array) @@ -358,12 +356,12 @@ async def _write_chunk( # merge new value if tmp is None: - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( shape=chunk_shape, dtype=self.metadata.dtype, order=self.metadata.order, + fill_value=self.metadata.fill_value, ) - chunk_array.fill(self.metadata.fill_value) else: chunk_array = tmp.copy( order=self.metadata.order, diff --git a/src/zarr/buffer.py 
b/src/zarr/buffer.py index ec0f4c52d6..3df119c292 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Tuple, TypeAlias +from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, Tuple, TypeAlias import numpy as np from zarr.common import BytesLike @@ -84,25 +84,17 @@ def __init__(self, array: NDArrayLike): self._data = array @classmethod - def create_empty( + def create( cls, *, shape: Iterable[int], dtype: np.DTypeLike, order: Literal["C", "F"] = "C", + fill_value: Optional[Any] = None, ) -> Self: - return cls(np.empty(shape=shape, dtype=dtype, order=order)) - - @classmethod - def create_zeros( - cls, - *, - shape: Iterable[int], - dtype: np.DTypeLike, - order: Literal["C", "F"] = "C", - ) -> Self: - ret = cls.create_empty(shape=shape, dtype=dtype, order=order) - ret[...] = 0 + ret = cls(np.empty(shape=shape, dtype=dtype, order=order)) + if fill_value is not None: + ret.fill(fill_value) return ret @classmethod diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6c74f944bb..41902ffc13 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -312,10 +312,8 @@ async def decode( ) # setup output array - out = NDBuffer.create_zeros( - shape=shard_shape, - dtype=shard_spec.dtype, - order=shard_spec.order, + out = NDBuffer.create( + shape=shard_shape, dtype=shard_spec.dtype, order=shard_spec.order, fill_value=0 ) shard_dict = await _ShardProxy.from_bytes(shard_bytes, self, chunks_per_shard) @@ -359,10 +357,8 @@ async def decode_partial( ) # setup output array - out = NDBuffer.create_zeros( - shape=indexer.shape, - dtype=shard_spec.dtype, - order=shard_spec.order, + out = NDBuffer.create( + shape=indexer.shape, dtype=shard_spec.dtype, order=shard_spec.order, fill_value=0 ) indexed_chunks = list(indexer) @@ -453,7 +449,7 @@ async def _write_chunk( chunk_array = shard_array[out_selection] else: # handling writing partial chunks - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( shape=chunk_shape, dtype=shard_spec.dtype, ) @@ -527,7 +523,7 @@ async def _write_chunk( # merge new value if chunk_bytes is None: - chunk_array = NDBuffer.create_empty( + chunk_array = NDBuffer.create( shape=self.chunk_shape, dtype=shard_spec.dtype, ) From 57e3dd615394bc3f34394759c6471db30e12ba1a Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 14 May 2024 11:23:50 +0200 Subject: [PATCH 30/45] getitem and setitem now use factory --- src/zarr/array.py | 29 ++++++++++++++-------------- src/zarr/buffer.py | 47 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 936834fd34..d718117264 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -20,7 +20,7 @@ # from zarr.array_v2 import ArrayV2 -from zarr.buffer import Buffer, NDBuffer +from zarr.buffer import Buffer, Factory, NDArrayLike, NDBuffer from zarr.codecs import BytesCodec from zarr.codecs.pipeline import CodecPipeline from zarr.common import ( @@ -185,7 +185,9 @@ def dtype(self) -> np.dtype[Any]: def attrs(self) -> dict[str, Any]: return self.metadata.attributes - async def getitem(self, selection: Selection) -> npt.NDArray[Any]: + async def getitem( + self, selection: Selection, *, factory: Factory.Create = NDBuffer.create + ) -> NDArrayLike: assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) indexer = BasicIndexer( selection, @@ -194,7 +196,7 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: ) # setup output array - out = NDBuffer.create( + out = factory( shape=indexer.shape, dtype=self.metadata.dtype, order=self.order, fill_value=0 ) @@ -207,12 +209,7 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: self._read_chunk, config.get("async.concurrency"), ) - - # We always return a numpy array to the user - if out.shape: - return out.as_numpy_array() - else: - return out.as_numpy_array()[()] + return out.as_ndarray_like() async def _save_metadata(self) -> None: await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(self.metadata.to_bytes())) @@ -244,7 +241,12 @@ async def _read_chunk( else: out[out_selection] = self.metadata.fill_value - async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None: + async def setitem( + self, + selection: Selection, + value: NDArrayLike, + factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like, + ) -> None: assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) chunk_shape = self.metadata.chunk_grid.chunk_shape indexer = BasicIndexer( @@ -257,8 +259,7 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None: # check value shape if np.isscalar(value): - # setting a scalar value - pass + value = np.asanyarray(value) else: if not hasattr(value, "shape"): value = np.asarray(value, self.metadata.dtype) @@ -266,9 +267,9 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None: if value.dtype.name != self.metadata.dtype.name: value = value.astype(self.metadata.dtype, order="A") - # We accept a numpy array as input from the user and convert it to a NDBuffer. + # We accept any ndarray like object from the user and convert it to a NDBuffer. # From this point onwards, we only pass Buffer and NDBuffer between components. 
- value = NDBuffer.from_numpy_array(value) + value = factory(value) # merging with existing data and encoding chunks await concurrent_map( diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 3df119c292..6eefb4c9aa 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,20 +1,44 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, Tuple, TypeAlias -import numpy as np - -from zarr.common import BytesLike +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterable, + Literal, + Optional, + Protocol, + Tuple, + TypeAlias, +) +import numpy as np if TYPE_CHECKING: from typing_extensions import Self from zarr.codecs.bytes import Endian + from zarr.common import BytesLike # TODO: create a protocol for the attributes we need, for now we just alias numpy NDArrayLike: TypeAlias = np.ndarray +class Factory: + class Create(Protocol): + def __call__( + self, + *, + shape: Iterable[int], + dtype: np.DTypeLike, + order: Literal["C", "F"], + fill_value: Optional[Any], + ) -> NDBuffer: ... + + class NDArrayLike(Protocol): + def __call__(self, ndarray_like: NDArrayLike) -> NDBuffer: ... + + class Buffer: """A flat contiguous memory block @@ -101,6 +125,21 @@ def create( def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: return cls(np.asanyarray(array_like)) + @classmethod + def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: + return cls(ndarray_like) + + def as_ndarray_like(self) -> NDArrayLike: + """Return the underlying array instance representing the memory of this buffer + + This will never copy data. + + Return + ------ + The underlying array such as a NumPy or CuPy array. + """ + return self._data + def as_buffer(self) -> Buffer: return Buffer(self._data.reshape(-1).view(dtype="b")) From 8bbe5c1d4d8ba55380ee71c7953149fca7194e9a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 11:54:22 +0200 Subject: [PATCH 31/45] doc --- src/zarr/array.py | 5 +++-- src/zarr/buffer.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index d718117264..b2932a3af3 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -267,8 +267,9 @@ async def setitem( if value.dtype.name != self.metadata.dtype.name: value = value.astype(self.metadata.dtype, order="A") - # We accept any ndarray like object from the user and convert it to a NDBuffer. - # From this point onwards, we only pass Buffer and NDBuffer between components. + # We accept any ndarray like object from the user and convert it + # to a NDBuffer (or subclass). From this point onwards, we only pass + # Buffer and NDBuffer between components. value = factory(value) # merging with existing data and encoding chunks diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 6eefb4c9aa..095cf05f49 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -33,10 +33,45 @@ def __call__( dtype: np.DTypeLike, order: Literal["C", "F"], fill_value: Optional[Any], - ) -> NDBuffer: ... + ) -> NDBuffer: + """Factory function to create a new NDBuffer (or subclass) + + Callables implementing the `Factory.Create` protocol must create a new + instance of NDBuffer (or subclass) given the following parameters. 
+ + Parameters + ---------- + shape + The shape of the new buffer + dtype + The datatype of each element in the new buffer + order + Whether to store multi-dimensional data in row-major (C-style) or + column-major (Fortran-style) order in memory. + fill_value + If not None, fill the new buffer with a scalar value. + + Return + ------ + A new NDBuffer or subclass instance + """ class NDArrayLike(Protocol): - def __call__(self, ndarray_like: NDArrayLike) -> NDBuffer: ... + def __call__(self, ndarray_like: NDArrayLike) -> NDBuffer: + """Factory function to coerce an array into a NDBuffer (or subclass) + + Callables implementing the `Factory.NDArrayLike` protocol must return + an instance of NDBuffer (or subclass) given an ndarray-like object. + + Parameters + ---------- + ndarray_like + ndarray-like object + + Return + ------ + A NDBuffer or subclass instance that represents `ndarray_like` + """ class Buffer: From 1c64b797f672fbcd515da722b8bba9ffa4aff1be Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 14:07:26 +0200 Subject: [PATCH 32/45] test --- tests/v3/test_buffer.py | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/v3/test_buffer.py diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py new file mode 100644 index 0000000000..edb3e40a35 --- /dev/null +++ b/tests/v3/test_buffer.py @@ -0,0 +1,57 @@ +from typing import Any, Iterable, Literal, Optional, Self +import numpy as np +import numpy.typing as npt +import pytest + +from zarr.array import AsyncArray +from zarr.buffer import NDBuffer +from zarr.store.core import StorePath +from zarr.store.memory import MemoryStore + + +class MyNDArrayLike(np.ndarray): + """An example of a ndarray-like class""" + + pass + + +class MyNDBuffer(NDBuffer): + """Example of a custom NDBuffer that handles MyNDArrayLike""" + + @classmethod + def create( + cls, + *, + shape: Iterable[int], + dtype: npt.DTypeLike, + order: Literal["C", "F"] = "C", + fill_value: Optional[Any] = None, + ) -> Self: + """Override `NDBuffer.create` to create a MyNDArrayLike instance""" + ret = cls(MyNDArrayLike(shape=shape, dtype=dtype, order=order)) + if fill_value is not None: + ret.fill(fill_value) + return ret + + +@pytest.mark.asyncio +async def test_async_array_factory(): + store = StorePath(MemoryStore()) + expect = np.zeros((9, 9), dtype="uint16", order="F") + a = await AsyncArray.create( + store / "test_async_array", + shape=expect.shape, + chunk_shape=(5, 5), + dtype=expect.dtype, + fill_value=0, + ) + expect[1:4, 3:6] = np.ones((3, 3)) + + await a.setitem( + selection=(slice(1, 4), slice(3, 6)), + value=np.ones((3, 3)), + factory=MyNDBuffer.from_ndarray_like, + ) + got = await a.getitem(selection=(slice(0, 9), slice(0, 9)), factory=MyNDBuffer.create) + assert isinstance(got, MyNDArrayLike) + assert np.array_equal(expect, got) From cd7eb44ac31c87ca70c91fbde4b5d5b7088954d3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 14 May 2024 15:27:54 +0200 Subject: [PATCH 33/45] check_item_key_is_1d_contiguous --- src/zarr/buffer.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 095cf05f49..0356568ff6 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -24,6 +24,15 @@ NDArrayLike: TypeAlias = np.ndarray +def check_item_key_is_1d_contiguous(key: Any) -> None: + if not isinstance(key, slice): + raise TypeError( + f"Item key has incorrect type (expected slice, got {key.__class__.__name__})" + ) + if not (key.step is None or key.step == 1): + raise ValueError("slice must be contiguous") + + class Factory: class Create(Protocol): def __call__( @@ -109,14 +118,16 @@ def to_bytes(self) -> bytes: def memoryview(self) -> memoryview: return memoryview(self._data) - def __getitem__(self, key: Any) -> Self: + def __getitem__(self, key: slice) -> Self: + check_item_key_is_1d_contiguous(key) return self.__class__(self._data.__getitem__(key)) - def __setitem__(self, key: Any, value: Any) -> None: + def __setitem__(self, key: slice, value: Any) -> None: + check_item_key_is_1d_contiguous(key) self._data.__setitem__(key, value) def __len__(self) -> int: - return self._data.nbytes + return self._data.size def __add__(self, other: Buffer) -> Self: return self.__class__(np.frombuffer(self.to_bytes() + other.to_bytes(), dtype="b")) From 01fcec1fe834f26f0ee91b29c8d6b33d52abbcab Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 16:22:50 +0200 Subject: [PATCH 34/45] Buffer.create_zero_length() --- src/zarr/buffer.py | 4 ++-- src/zarr/codecs/sharding.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 0356568ff6..f060a2649f 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -98,8 +98,8 @@ def __init__(self, array: NDArrayLike): self._data = array @classmethod - def create_empty(cls, *, nbytes: int) -> Self: - return cls(np.empty(shape=(nbytes,), dtype="b", order="C")) + def create_zero_length(cls) -> Self: + return cls(np.array([], dtype="b")) @classmethod def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 41902ffc13..b63d1e499b 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -148,7 +148,7 @@ async def from_bytes( def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() - obj.buf = Buffer.create_empty(nbytes=0) + obj.buf = Buffer.create_zero_length() obj.index = index return obj @@ -190,7 +190,7 @@ def merge_with_morton_order( @classmethod def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj = cls() - obj.buf = Buffer.create_empty(nbytes=0) + obj.buf = Buffer.create_zero_length() obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj From 9cc6edc099eb2a796b7472a78c32aa607ca92589 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 14 May 2024 16:39:52 +0200 Subject: [PATCH 35/45] Buffer.__add__(): use concat --- src/zarr/buffer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index f060a2649f..a30fcfdf38 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -130,7 +130,8 @@ def __len__(self) -> int: return self._data.size def __add__(self, other: Buffer) -> Self: - return self.__class__(np.frombuffer(self.to_bytes() + other.to_bytes(), dtype="b")) + assert other._data.dtype == np.dtype("b") + return self.__class__(np.concatenate((self._data, other._data))) def __eq__(self, other: Any) -> bool: if isinstance(other, (bytes, bytearray)): From 40a30f1671d063e59c17660981c98800a3f70a15 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 16:54:27 +0200 Subject: [PATCH 36/45] Buffer.as_ndarray_like --- src/zarr/buffer.py | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index a30fcfdf38..8e6a40a14b 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -102,17 +102,39 @@ def create_zero_length(cls) -> Self: return cls(np.array([], dtype="b")) @classmethod - def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: - return cls(np.asarray(array_like).reshape(-1).view(dtype="b")) + def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: + return cls(ndarray_like) @classmethod def from_bytes(cls, data: BytesLike) -> Self: - return cls.from_numpy_array(np.frombuffer(data, dtype="b")) + return cls.from_ndarray_like(np.frombuffer(data, dtype="b")) + + def as_ndarray_like(self) -> NDArrayLike: + """Return the underlying array that represents the memory of this buffer + + This will never copy data. + + Return + ------ + The underlying 1d array such as a NumPy or CuPy array. + """ + return self._data def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: return NDBuffer(self._data.view(dtype=dtype)) def to_bytes(self) -> bytes: + """Return the buffer as `bytes` (host memory). + + Warning + ------- + Will always copy data, only use this method for small buffers such + as metadata. If possible, use `.as_ndarray_like()` instead. + + Return + ------ + `bytes` of this buffer (data copy) + """ return bytes(self.memoryview()) def memoryview(self) -> memoryview: @@ -130,12 +152,17 @@ def __len__(self) -> int: return self._data.size def __add__(self, other: Buffer) -> Self: - assert other._data.dtype == np.dtype("b") - return self.__class__(np.concatenate((self._data, other._data))) + other_array = other.as_ndarray_like() + assert other_array.dtype == np.dtype("b") + return self.__class__(np.concatenate((self._data, other_array))) def __eq__(self, other: Any) -> bool: if isinstance(other, (bytes, bytearray)): - return self.to_bytes() == other + # Many of the tests compares `Buffer` with `bytes` so we + # convert the bytes to a Buffer and try again + return self == self.from_bytes(other) + if isinstance(other, Buffer): + return (self._data == other.as_ndarray_like()).all() raise ValueError( f"equal operator not supported between {self.__class__} and {other.__class__}" ) @@ -144,7 +171,7 @@ def __eq__(self, other: Any) -> bool: class NDBuffer: """A n-dimensional memory block - We use `NDBuffer` throughout Zarr to represent a block of memory. + We use `NDBuffer` throughout Zarr to represent a n-dimensional memory block. 
For now, we only support host memory but the plan is to support other types of memory such as CUDA device memory. """ @@ -177,7 +204,7 @@ def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: return cls(ndarray_like) def as_ndarray_like(self) -> NDArrayLike: - """Return the underlying array instance representing the memory of this buffer + """Return the underlying array that represents the memory of this buffer This will never copy data. @@ -195,7 +222,8 @@ def as_numpy_array(self) -> np.ndarray: Warning ------- - Might have to copy data, only use this method for small buffers such as metadata + Might have to copy data, only use this method for small buffers such + as metadata. If possible, use `.as_ndarray_like()` instead. Return ------ From 2421c5e7fbb555b0f27ad53a528fff3365d78a17 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 17:09:54 +0200 Subject: [PATCH 37/45] Buffer.as_numpy_array --- src/zarr/buffer.py | 35 ++++++++++++++++++++++------------- src/zarr/codecs/blosc.py | 2 +- src/zarr/codecs/crc32c_.py | 7 ++++--- src/zarr/store/local.py | 4 ++-- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 8e6a40a14b..45fe27b71e 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -109,8 +109,11 @@ def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: def from_bytes(cls, data: BytesLike) -> Self: return cls.from_ndarray_like(np.frombuffer(data, dtype="b")) + def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: + return NDBuffer(self._data.view(dtype=dtype)) + def as_ndarray_like(self) -> NDArrayLike: - """Return the underlying array that represents the memory of this buffer + """Return the underlying array (host or device memory) of this buffer This will never copy data. @@ -120,25 +123,32 @@ def as_ndarray_like(self) -> NDArrayLike: """ return self._data - def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: - return NDBuffer(self._data.view(dtype=dtype)) + def as_numpy_array(self) -> np.ndarray: + """Return the buffer as a NumPy array (host memory). + + Warning + ------- + Might have to copy data, consider using `.as_ndarray_like()` instead. + + Return + ------ + NumPy array of this buffer (might be a data copy) + """ + return self._data def to_bytes(self) -> bytes: """Return the buffer as `bytes` (host memory). Warning ------- - Will always copy data, only use this method for small buffers such - as metadata. If possible, use `.as_ndarray_like()` instead. + Will always copy data, only use this method for small buffers such as meta- + data. If possible, use `.as_numpy_array()` or `.as_ndarray_like()` instead. Return ------ `bytes` of this buffer (data copy) """ - return bytes(self.memoryview()) - - def memoryview(self) -> memoryview: - return memoryview(self._data) + return bytes(self.as_numpy_array()) def __getitem__(self, key: slice) -> Self: check_item_key_is_1d_contiguous(key) @@ -204,7 +214,7 @@ def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: return cls(ndarray_like) def as_ndarray_like(self) -> NDArrayLike: - """Return the underlying array that represents the memory of this buffer + """Return the underlying array (host or device memory) of this buffer This will never copy data. @@ -218,12 +228,11 @@ def as_buffer(self) -> Buffer: return Buffer(self._data.reshape(-1).view(dtype="b")) def as_numpy_array(self) -> np.ndarray: - """Return the buffer as a NumPy array. + """Return the buffer as a NumPy array (host memory). 
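On the NDBuffer side, a short sketch of the accessor contract being documented here (again assuming the default NumPy backing):

    import numpy as np
    from zarr.buffer import NDBuffer

    nd = NDBuffer.from_numpy_array(np.arange(6, dtype="int32").reshape(2, 3))

    # as_ndarray_like() hands back the backing array itself, never a copy.
    assert nd.as_ndarray_like() is nd.as_ndarray_like()

    # as_buffer() reinterprets the same memory as a flat byte Buffer.
    flat = nd.as_buffer()
    assert len(flat) == 6 * 4   # 6 int32 elements, 24 bytes
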
Warning ------- - Might have to copy data, only use this method for small buffers such - as metadata. If possible, use `.as_ndarray_like()` instead. + Might have to copy data, consider using `.as_ndarray_like()` instead. Return ------ diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index ff52dba061..e7b374ab1c 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -173,7 +173,7 @@ async def encode( # Since blosc only takes bytes, we convert the input and output of the encoding # between bytes and Buffer return await to_thread( - lambda chunk: Buffer.from_bytes(self._blosc_codec.encode(chunk.memoryview())), + lambda chunk: Buffer.from_bytes(self._blosc_codec.encode(chunk.as_ndarray_like())), chunk_bytes, ) diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index ac389c1406..07c38c30df 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -35,7 +35,7 @@ async def decode( chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Buffer: - data = chunk_bytes.memoryview() + data = chunk_bytes.to_bytes() crc32_bytes = data[-4:] inner_bytes = data[:-4] @@ -53,8 +53,9 @@ async def encode( chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Optional[Buffer]: - checksum = crc32c(chunk_bytes.memoryview()) - return Buffer.from_bytes(chunk_bytes.to_bytes() + np.uint32(checksum).tobytes()) + data = chunk_bytes.to_bytes() + checksum = crc32c(data) + return Buffer.from_bytes(data + np.uint32(checksum).tobytes()) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 diff --git a/src/zarr/store/local.py b/src/zarr/store/local.py index 9b1dbab371..738be6dc59 100644 --- a/src/zarr/store/local.py +++ b/src/zarr/store/local.py @@ -58,10 +58,10 @@ def _put( if start is not None: with path.open("r+b") as f: f.seek(start) - f.write(value.memoryview()) + f.write(value.as_numpy_array()) return None else: - return path.write_bytes(value.memoryview()) + return path.write_bytes(value.as_numpy_array()) class LocalStore(Store): From 227c0d9179448b8ccf724ad9e8a885e88868f86a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 17:24:28 +0200 Subject: [PATCH 38/45] crc32c: use as_numpy_array --- src/zarr/codecs/crc32c_.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index 07c38c30df..a20dde3770 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -35,7 +35,7 @@ async def decode( chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Buffer: - data = chunk_bytes.to_bytes() + data = chunk_bytes.as_numpy_array() crc32_bytes = data[-4:] inner_bytes = data[:-4] @@ -46,16 +46,18 @@ async def decode( "Stored and computed checksum do not match. " + f"Stored: {stored_checksum!r}. Computed: {computed_checksum!r}." 
) - return Buffer.from_bytes(inner_bytes) + return Buffer.from_ndarray_like(inner_bytes) async def encode( self, chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Optional[Buffer]: - data = chunk_bytes.to_bytes() - checksum = crc32c(data) - return Buffer.from_bytes(data + np.uint32(checksum).tobytes()) + data = chunk_bytes.as_numpy_array() + # Calculate the checksum and "cast" it to a numpy array + checksum = np.array([crc32c(data)], dtype=np.uint32) + # Append the checksum (as bytes) to the data + return Buffer.from_ndarray_like(np.append(data, checksum.view("b"))) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 From c1c218537d3347395d8863e2e03035a7afe8799f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 17:45:49 +0200 Subject: [PATCH 39/45] as_numpy_array_wrapper --- src/zarr/buffer.py | 28 ++++++++++++++++++++++---- src/zarr/codecs/blosc.py | 4 ++-- src/zarr/codecs/gzip.py | 6 +++--- src/zarr/codecs/zstd.py | 6 +++--- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 45fe27b71e..fe7a828c85 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -141,8 +141,8 @@ def to_bytes(self) -> bytes: Warning ------- - Will always copy data, only use this method for small buffers such as meta- - data. If possible, use `.as_numpy_array()` or `.as_ndarray_like()` instead. + Will always copy data, only use this method for small buffers such as metadata + buffers. If possible, use `.as_numpy_array()` or `.as_ndarray_like()` instead. Return ------ @@ -289,5 +289,25 @@ def transpose(self, *axes: np.SupportsIndex) -> Self: return self.__class__(self._data.transpose(*axes)) -def as_bytes_wrapper(func: Callable[[bytes], bytes], buf: Buffer) -> Buffer: - return Buffer.from_bytes(func(buf.to_bytes())) +def as_numpy_array_wrapper(func: Callable[[np.ndarray], bytes], buf: Buffer) -> Buffer: + """Converts the input of `func` to a numpy array and the output back to `Buffer`. + + This function is useful when calling a `func` that only supports host memory such + as `GZip.decode` and `Blosc.decode`. In this case, use this wrapper to convert + the input `buf` to a Numpy array and convert the result back into a `Buffer`. + + Parameters + ---------- + func + The callable that will be called with the converted `buf` as input. + `func` must return bytes, which will be converted into a `Buffer` + before being returned. + buf + The buffer that will be converted to a Numpy array before being given as + input to `func`.
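The checksum framing produced by the crc32c change above can be sketched as follows; it mirrors the codec's arithmetic and assumes only the `crc32c` function the codec already calls:

    import numpy as np
    from crc32c import crc32c

    payload = np.frombuffer(b"chunk-data", dtype="b")
    checksum = np.array([crc32c(payload)], dtype=np.uint32)
    framed = np.append(payload, checksum.view("b"))

    assert framed.nbytes == payload.nbytes + 4   # matches compute_encoded_size
    # decode recomputes the checksum over everything but the trailing 4 bytes
    assert crc32c(framed[:-4]) == framed[-4:].view(np.uint32)[0]
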
+ + Return + ------ + The result of `func` converted to a `Buffer` + """ + return Buffer.from_bytes(func(buf.as_numpy_array())) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index e7b374ab1c..7334139acb 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -9,7 +9,7 @@ from numcodecs.blosc import Blosc from zarr.abc.codec import BytesBytesCodec -from zarr.buffer import Buffer, as_bytes_wrapper +from zarr.buffer import Buffer, as_numpy_array_wrapper from zarr.codecs.registry import register_codec from zarr.common import parse_enum, parse_named_configuration, to_thread @@ -163,7 +163,7 @@ async def decode( chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Buffer: - return await to_thread(as_bytes_wrapper, self._blosc_codec.decode, chunk_bytes) + return await to_thread(as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes) async def encode( self, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index cf36e8679d..a8d7f815aa 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -5,7 +5,7 @@ from numcodecs.gzip import GZip from zarr.abc.codec import BytesBytesCodec -from zarr.buffer import Buffer, as_bytes_wrapper +from zarr.buffer import Buffer, as_numpy_array_wrapper from zarr.codecs.registry import register_codec from zarr.common import parse_named_configuration, to_thread @@ -49,14 +49,14 @@ async def decode( chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Buffer: - return await to_thread(as_bytes_wrapper, GZip(self.level).decode, chunk_bytes) + return await to_thread(as_numpy_array_wrapper, GZip(self.level).decode, chunk_bytes) async def encode( self, chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Optional[Buffer]: - return await to_thread(as_bytes_wrapper, GZip(self.level).encode, chunk_bytes) + return await to_thread(as_numpy_array_wrapper, GZip(self.level).encode, chunk_bytes) def compute_encoded_size( self, diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 2f5ee8868b..0cc99a0368 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -6,7 +6,7 @@ from zstandard import ZstdCompressor, ZstdDecompressor from zarr.abc.codec import BytesBytesCodec -from zarr.buffer import Buffer, as_bytes_wrapper +from zarr.buffer import Buffer, as_numpy_array_wrapper from zarr.codecs.registry import register_codec from zarr.common import parse_named_configuration, to_thread @@ -65,14 +65,14 @@ async def decode( chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Buffer: - return await to_thread(as_bytes_wrapper, self._decompress, chunk_bytes) + return await to_thread(as_numpy_array_wrapper, self._decompress, chunk_bytes) async def encode( self, chunk_bytes: Buffer, _chunk_spec: ArraySpec, ) -> Optional[Buffer]: - return await to_thread(as_bytes_wrapper, self._compress, chunk_bytes) + return await to_thread(as_numpy_array_wrapper, self._compress, chunk_bytes) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError From 275cd6c2c3621f5f8c3430cb2634fc240e0d5405 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Tue, 14 May 2024 17:57:56 +0200 Subject: [PATCH 40/45] fix import --- src/zarr/buffer.py | 13 ++----------- tests/v3/test_buffer.py | 6 +++++- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index fe7a828c85..60a92feba7 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,17 +1,8 @@ from __future__ import annotations import sys -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Iterable, - Literal, - Optional, - Protocol, - Tuple, - TypeAlias, -) +from typing import (TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, + Protocol, Tuple, TypeAlias) import numpy as np diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py index edb3e40a35..c93d29f452 100644 --- a/tests/v3/test_buffer.py +++ b/tests/v3/test_buffer.py @@ -1,4 +1,5 @@ -from typing import Any, Iterable, Literal, Optional, Self +from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional + import numpy as np import numpy.typing as npt import pytest @@ -8,6 +9,9 @@ from zarr.store.core import StorePath from zarr.store.memory import MemoryStore +if TYPE_CHECKING: + from typing_extensions import Self + class MyNDArrayLike(np.ndarray): """An example of a ndarray-like class""" From 91809e546350cca23497b60868fc03844156dd39 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 14 May 2024 18:00:27 +0200 Subject: [PATCH 41/45] use from __future__ import annotations --- src/zarr/buffer.py | 13 +++++++++++-- tests/v3/test_buffer.py | 2 ++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 60a92feba7..fe7a828c85 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -1,8 +1,17 @@ from __future__ import annotations import sys -from typing import (TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, - Protocol, Tuple, TypeAlias) +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterable, + Literal, + Optional, + Protocol, + Tuple, + TypeAlias, +) import numpy as np diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py index c93d29f452..a56c768782 100644 --- a/tests/v3/test_buffer.py +++ b/tests/v3/test_buffer.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional import numpy as np From b5eec5ddb88ee9a0b07db28da8b6a50b7db06aaa Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 15 May 2024 09:58:28 +0200 Subject: [PATCH 42/45] doc and clean up --- src/zarr/buffer.py | 177 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 155 insertions(+), 22 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index fe7a828c85..e6b75886c7 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -25,6 +25,7 @@ def check_item_key_is_1d_contiguous(key: Any) -> None: + """Raises error if `key` isn't a 1d contiguous slice""" if not isinstance(key, slice): raise TypeError( f"Item key has incorrect type (expected slice, got {key.__class__.__name__})" @@ -86,31 +87,70 @@ def __call__(self, ndarray_like: NDArrayLike) -> NDBuffer: class Buffer: """A flat contiguous memory block - We use `Buffer` throughout Zarr to represent a contiguous block of memory. - For now, we only support host memory but the plan is to support other types - of memory such as CUDA device memory. + We use Buffer throughout Zarr to represent a contiguous block of memory. + + A Buffer is backed by a underlying ndarray-like instance that represents + the memory. 
The memory type is unspecified; can be regular host memory, + CUDA device memory, or something else. The only requirement is that the + ndarray-like instance can be copied/converted to a regular Numpy array + (host memory). + + Note + ---- + This buffer is untyped, so all indexing and sizes are in bytes. + + Parameters + ---------- + ndarray_like + ndarray-like object that must be 1-dim, contiguous, and byte dtype. """ - def __init__(self, array: NDArrayLike): - assert array.ndim == 1 - assert array.itemsize == 1 - assert array.dtype == np.dtype("b") - self._data = array + def __init__(self, ndarray_like: NDArrayLike): + if ndarray_like.ndim != 1: + raise ValueError("ndarray_like: only 1-dim allowed") + if ndarray_like.dtype != np.dtype("b"): + raise ValueError("ndarray_like: only byte dtype allowed") + self._data = ndarray_like @classmethod def create_zero_length(cls) -> Self: + """Create an empty buffer with length zero + + Return + ------ + New empty 0-length buffer + """ return cls(np.array([], dtype="b")) @classmethod def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: + """Create a new buffer of a ndarray-like object + + Parameters + ---------- + ndarray_like + ndarray-like object that must be 1-dim, contiguous, and byte dtype. + + Return + ------ + New buffer representing `ndarray_like` + """ return cls(ndarray_like) @classmethod - def from_bytes(cls, data: BytesLike) -> Self: - return cls.from_ndarray_like(np.frombuffer(data, dtype="b")) + def from_bytes(cls, bytes_like: BytesLike) -> Self: + """Create a new buffer of a bytes-like object (host memory) - def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: - return NDBuffer(self._data.view(dtype=dtype)) + Parameters + ---------- + bytes_like + bytes-like object + + Return + ------ + New buffer representing `bytes_like` + """ + return cls.from_ndarray_like(np.frombuffer(bytes_like, dtype="b")) def as_ndarray_like(self) -> NDArrayLike: """Return the underlying array (host or device memory) of this buffer @@ -123,6 +163,22 @@ def as_ndarray_like(self) -> NDArrayLike: """ return self._data + def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: + """Create a new NDBuffer from this one. + + This will never copy data. + + Parameters + ---------- + dtype + The datatype of the returned buffer (reinterpretation of the bytes) + + Return + ------ + New NDbuffer representing `self.as_ndarray_like()` + """ + return NDBuffer.from_ndarray_like(self._data.view(dtype=dtype)) + def as_numpy_array(self) -> np.ndarray: """Return the buffer as a NumPy array (host memory). @@ -134,7 +190,7 @@ def as_numpy_array(self) -> np.ndarray: ------ NumPy array of this buffer (might be a data copy) """ - return self._data + return np.asanyarray(self._data) def to_bytes(self) -> bytes: """Return the buffer as `bytes` (host memory). @@ -162,6 +218,8 @@ def __len__(self) -> int: return self._data.size def __add__(self, other: Buffer) -> Self: + """Concatenate two buffers""" + other_array = other.as_ndarray_like() assert other_array.dtype == np.dtype("b") return self.__class__(np.concatenate((self._data, other_array))) @@ -181,9 +239,26 @@ def __eq__(self, other: Any) -> bool: class NDBuffer: """A n-dimensional memory block - We use `NDBuffer` throughout Zarr to represent a n-dimensional memory block. - For now, we only support host memory but the plan is to support other types - of memory such as CUDA device memory. + We use NDBuffer throughout Zarr to represent a n-dimensional memory block. 
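With the constructor validation added here, misuse surfaces as ordinary exceptions rather than failed asserts; a short sketch (NumPy-backed default assumed):

    import numpy as np
    from zarr.buffer import Buffer

    Buffer(np.zeros(4, dtype="b"))      # accepted: 1-dim, byte dtype
    for bad in (
        np.zeros((2, 2), dtype="b"),    # rejected: not 1-dim
        np.zeros(4, dtype="int32"),     # rejected: not byte dtype
    ):
        try:
            Buffer(bad)
        except ValueError:
            pass
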
+ + An NDBuffer is backed by an underlying ndarray-like instance that represents + the memory. The memory type is unspecified; can be regular host memory, + CUDA device memory, or something else. The only requirement is that the + ndarray-like instance can be copied/converted to a regular Numpy array + (host memory). + + Note + ---- + The two buffer classes Buffer and NDBuffer are very similar. In fact, + Buffer is a special case of NDBuffer where dim=1, stride=1, and dtype="b". + However, in order to use the Python's type system to differentiate between + the flat contiguous Buffer and the n-dim (non-contiguous) NDBuffer, we keep + the definition of the two classes separate. + + Parameters + ---------- + ndarray_like + ndarray-like object that is convertible to a regular Numpy array. """ def __init__(self, array: NDArrayLike): @@ -200,19 +275,64 @@ def create( order: Literal["C", "F"] = "C", fill_value: Optional[Any] = None, ) -> Self: + """Create a new buffer and its underlying ndarray-like object + + Parameters + ---------- + shape + The shape of the buffer and its underlying ndarray-like object + dtype + The datatype of the buffer and its underlying ndarray-like object + order + Whether to store multi-dimensional data in row-major (C-style) or + column-major (Fortran-style) order in memory. + fill_value + If not None, fill the new buffer with a scalar value. + + Return + ------ + New buffer representing a new ndarray_like object + + Developer Notes + --------------- + A subclass can override this method to create an ndarray-like object + other than the default Numpy array. + """ ret = cls(np.empty(shape=shape, dtype=dtype, order=order)) if fill_value is not None: ret.fill(fill_value) return ret - @classmethod - def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: - return cls(np.asanyarray(array_like)) - @classmethod def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: + """Create a new buffer of an ndarray-like object + + Parameters + ---------- + ndarray_like + ndarray-like object + + Return + ------ + New buffer representing `ndarray_like` + """ return cls(ndarray_like) + @classmethod + def from_numpy_array(cls, array_like: np.ArrayLike) -> Self: + """Create a new buffer of a Numpy array-like object + + Parameters + ---------- + array_like + Object that can be coerced into a Numpy array + + Return + ------ + New buffer representing `array_like` + """ + return cls.from_ndarray_like(np.asanyarray(array_like)) + def as_ndarray_like(self) -> NDArrayLike: """Return the underlying array (host or device memory) of this buffer @@ -225,7 +345,20 @@ def as_ndarray_like(self) -> NDArrayLike: return self._data def as_buffer(self) -> Buffer: - return Buffer(self._data.reshape(-1).view(dtype="b")) + """Create a new Buffer from this one. + + Warning + ------- + Copies data if the buffer is non-contiguous. + + Return + ------ + The new buffer (might be data copy) + """ + data = self._data + if not self._data.flags.contiguous: + data = np.ascontiguousarray(self._data) + return Buffer(data.reshape(-1).view(dtype="b")) # Flatten the array without copy def as_numpy_array(self) -> np.ndarray: """Return the buffer as a NumPy array (host memory). @@ -238,7 +371,7 @@ def as_numpy_array(self) -> np.ndarray: ------ NumPy array of this buffer (might be a data copy) """ - return self._data + return np.asanyarray(self._data) @property def dtype(self) -> np.dtype[Any]: From 197b9b086f5cfa1dd1bdc98b2b19f4adb0b386ba Mon Sep 17 00:00:00 2001 From: "Mads R. B.
Kristensen" Date: Wed, 15 May 2024 11:26:51 +0200 Subject: [PATCH 43/45] doc --- src/zarr/buffer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index e6b75886c7..615bc4d579 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -249,11 +249,11 @@ class NDBuffer: Note ---- - The two buffer classes Buffer and NDBuffer are very similar. In fact, - Buffer is a special case of NDBuffer where dim=1, stride=1, and dtype="b". - However, in order to use the Python's type system to differentiate between - the flat contiguous Buffer and the n-dim (non-contiguous) NDBuffer, we keep - the definition of the two classes separate. + The two buffer classes Buffer and NDBuffer are very similar. In fact, Buffer + is a special case of NDBuffer where dim=1, stride=1, and dtype="b". However, + in order to use Python's type system to differentiate between the contiguous + Buffer and the n-dim (non-contiguous) NDBuffer, we keep the definition of the + two classes separate. Parameters ---------- From b5f87f1a8d1505dbe893e477d9ce85ff744aec52 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 15 May 2024 12:22:06 +0200 Subject: [PATCH 44/45] Apply suggestions from code review Co-authored-by: Norman Rzepka --- src/zarr/buffer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 615bc4d579..6ab9b454e0 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -46,7 +46,7 @@ def __call__( ) -> NDBuffer: """Factory function to create a new NDBuffer (or subclass) - Callables implementing the `Factor.Create` protocol must create a new + Callables implementing the `Factory.Create` protocol must create a new instance of NDBuffer (or subclass) given the following parameters. Parameters @@ -70,7 +70,7 @@ class NDArrayLike(Protocol): def __call__(self, ndarray_like: NDArrayLike) -> NDBuffer: """Factory function to coerce an array into a NDBuffer (or subclass) - Callables implementing the `Factor.NDArrayLike` protocol must return + Callables implementing the `Factory.NDArrayLike` protocol must return an instance of NDBuffer (or subclass) given an ndarray-like object. Parameters From 3854becbdffab97bb7eb7acf3580b0d4a4648761 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 15 May 2024 14:40:50 +0200 Subject: [PATCH 45/45] Buffer is now backed by ArrayLike --- src/zarr/buffer.py | 50 ++++++++++++++++++++------------------ src/zarr/codecs/blosc.py | 2 +- src/zarr/codecs/crc32c_.py | 4 +-- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 6ab9b454e0..a633cc09ec 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -20,7 +20,9 @@ from zarr.codecs.bytes import Endian from zarr.common import BytesLike -# TODO: create a protocol for the attributes we need, for now we just aliasing numpy +# TODO: create a protocol for the attributes we need, for now we alias Numpy's ndarray +# both for the array-like and ndarray-like +ArrayLike: TypeAlias = np.ndarray NDArrayLike: TypeAlias = np.ndarray @@ -89,10 +91,10 @@ class Buffer: We use Buffer throughout Zarr to represent a contiguous block of memory. - A Buffer is backed by a underlying ndarray-like instance that represents + A Buffer is backed by a underlying array-like instance that represents the memory. The memory type is unspecified; can be regular host memory, CUDA device memory, or something else. 
The only requirement is that the - ndarray-like instance can be copied/converted to a regular Numpy array + array-like instance can be copied/converted to a regular Numpy array (host memory). Note @@ -101,16 +103,16 @@ class Buffer: Parameters ---------- - ndarray_like - ndarray-like object that must be 1-dim, contiguous, and byte dtype. + array_like + array-like object that must be 1-dim, contiguous, and byte dtype. """ - def __init__(self, ndarray_like: NDArrayLike): - if ndarray_like.ndim != 1: - raise ValueError("ndarray_like: only 1-dim allowed") - if ndarray_like.dtype != np.dtype("b"): - raise ValueError("ndarray_like: only byte dtype allowed") - self._data = ndarray_like + def __init__(self, array_like: ArrayLike): + if array_like.ndim != 1: + raise ValueError("array_like: only 1-dim allowed") + if array_like.dtype != np.dtype("b"): + raise ValueError("array_like: only byte dtype allowed") + self._data = array_like @classmethod def create_zero_length(cls) -> Self: @@ -123,19 +125,19 @@ def create_zero_length(cls) -> Self: return cls(np.array([], dtype="b")) @classmethod - def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self: - """Create a new buffer of a ndarray-like object + def from_array_like(cls, array_like: NDArrayLike) -> Self: + """Create a new buffer of a array-like object Parameters ---------- - ndarray_like - ndarray-like object that must be 1-dim, contiguous, and byte dtype. + array_like + array-like object that must be 1-dim, contiguous, and byte dtype. Return ------ - New buffer representing `ndarray_like` + New buffer representing `array_like` """ - return cls(ndarray_like) + return cls(array_like) @classmethod def from_bytes(cls, bytes_like: BytesLike) -> Self: @@ -150,9 +152,9 @@ def from_bytes(cls, bytes_like: BytesLike) -> Self: ------ New buffer representing `bytes_like` """ - return cls.from_ndarray_like(np.frombuffer(bytes_like, dtype="b")) + return cls.from_array_like(np.frombuffer(bytes_like, dtype="b")) - def as_ndarray_like(self) -> NDArrayLike: + def as_array_like(self) -> NDArrayLike: """Return the underlying array (host or device memory) of this buffer This will never copy data. @@ -175,7 +177,7 @@ def as_nd_buffer(self, *, dtype: np.DTypeLike) -> NDBuffer: Return ------ - New NDbuffer representing `self.as_ndarray_like()` + New NDbuffer representing `self.as_array_like()` """ return NDBuffer.from_ndarray_like(self._data.view(dtype=dtype)) @@ -184,7 +186,7 @@ def as_numpy_array(self) -> np.ndarray: Warning ------- - Might have to copy data, consider using `.as_ndarray_like()` instead. + Might have to copy data, consider using `.as_array_like()` instead. Return ------ @@ -198,7 +200,7 @@ def to_bytes(self) -> bytes: Warning ------- Will always copy data, only use this method for small buffers such as metadata - buffers. If possible, use `.as_numpy_array()` or `.as_ndarray_like()` instead. + buffers. If possible, use `.as_numpy_array()` or `.as_array_like()` instead. 
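After this rename a Buffer is constructed from, and hands back, an ArrayLike; a sketch of the new names (NumPy default assumed):

    import numpy as np
    from zarr.buffer import Buffer

    buf = Buffer.from_array_like(np.frombuffer(b"xyz", dtype="b"))
    assert isinstance(buf.as_array_like(), np.ndarray)   # zero-copy accessor
    assert buf.to_bytes() == b"xyz"                      # always copies to host bytes
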
Return ------ @@ -220,7 +222,7 @@ def __len__(self) -> int: def __add__(self, other: Buffer) -> Self: """Concatenate two buffers""" - other_array = other.as_ndarray_like() + other_array = other.as_array_like() assert other_array.dtype == np.dtype("b") return self.__class__(np.concatenate((self._data, other_array))) @@ -230,7 +232,7 @@ def __eq__(self, other: Any) -> bool: # convert the bytes to a Buffer and try again return self == self.from_bytes(other) if isinstance(other, Buffer): - return (self._data == other.as_ndarray_like()).all() + return (self._data == other.as_array_like()).all() raise ValueError( f"equal operator not supported between {self.__class__} and {other.__class__}" ) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 7334139acb..7e94575f9a 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -173,7 +173,7 @@ async def encode( # Since blosc only takes bytes, we convert the input and output of the encoding # between bytes and Buffer return await to_thread( - lambda chunk: Buffer.from_bytes(self._blosc_codec.encode(chunk.as_ndarray_like())), + lambda chunk: Buffer.from_bytes(self._blosc_codec.encode(chunk.as_array_like())), chunk_bytes, ) diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index a20dde3770..1daf512e43 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -46,7 +46,7 @@ async def decode( "Stored and computed checksum do not match. " + f"Stored: {stored_checksum!r}. Computed: {computed_checksum!r}." ) - return Buffer.from_ndarray_like(inner_bytes) + return Buffer.from_array_like(inner_bytes) async def encode( self, @@ -57,7 +57,7 @@ async def encode( # Calculate the checksum and "cast" it to a numpy array checksum = np.array([crc32c(data)], dtype=np.uint32) # Append the checksum (as bytes) to the data - return Buffer.from_ndarray_like(np.append(data, checksum.view("b"))) + return Buffer.from_array_like(np.append(data, checksum.view("b"))) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4
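Finally, a sketch of the Buffer/NDBuffer round-trip these patches converge on, of the kind the codec pipeline performs (NumPy-backed defaults assumed):

    import numpy as np
    from zarr.buffer import Buffer, NDBuffer

    nd = NDBuffer.from_numpy_array(np.arange(4, dtype="uint16"))
    flat = nd.as_buffer()                      # untyped view of the same memory
    back = flat.as_nd_buffer(dtype="uint16")   # reinterpret the bytes, no copy
    assert (back.as_numpy_array() == np.arange(4, dtype="uint16")).all()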