From 6096a514ea729844355d1723f4e6160ee981e8cc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 12 Sep 2024 22:15:54 +0200 Subject: [PATCH 1/4] fix: narrow JSON type, ensure compliance with it, and fix a variety of v2 metadata issues --- src/zarr/abc/metadata.py | 2 +- src/zarr/codecs/_v2.py | 11 +++--- src/zarr/codecs/blosc.py | 4 +- src/zarr/codecs/bytes.py | 2 +- src/zarr/codecs/pipeline.py | 9 ++++- src/zarr/codecs/sharding.py | 12 +++--- src/zarr/codecs/transpose.py | 2 +- src/zarr/core/array.py | 16 ++------ src/zarr/core/common.py | 6 +-- src/zarr/core/metadata/v2.py | 62 ++++++++++++++++++++----------- src/zarr/core/metadata/v3.py | 41 ++++++++++---------- tests/v3/test_metadata/test_v2.py | 8 ++-- 12 files changed, 95 insertions(+), 80 deletions(-) diff --git a/src/zarr/abc/metadata.py b/src/zarr/abc/metadata.py index d9b11af883..7ea668c891 100644 --- a/src/zarr/abc/metadata.py +++ b/src/zarr/abc/metadata.py @@ -15,7 +15,7 @@ @dataclass(frozen=True) class Metadata: - def to_dict(self) -> JSON: + def to_dict(self) -> dict[str, JSON]: """ Recursively serialize this model to a dictionary. This method inspects the fields of self and calls `x.to_dict()` for any fields that diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index eb8ec435f5..c8bc558349 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -67,7 +67,7 @@ def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) @dataclass(frozen=True) class V2Filters(ArrayArrayCodec): - filters: list[dict[str, JSON]] + filters: tuple[numcodecs.abc.Codec, ...] | None is_fixed_size = False @@ -79,8 +79,7 @@ async def _decode_single( chunk_ndarray = chunk_array.as_ndarray_like() # apply filters in reverse order if self.filters is not None: - for filter_metadata in self.filters[::-1]: - filter = numcodecs.get_codec(filter_metadata) + for filter in self.filters[::-1]: chunk_ndarray = await to_thread(filter.decode, chunk_ndarray) # ensure correct chunk shape @@ -99,9 +98,9 @@ async def _encode_single( ) -> NDBuffer | None: chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order) - for filter_metadata in self.filters: - filter = numcodecs.get_codec(filter_metadata) - chunk_ndarray = await to_thread(filter.encode, chunk_ndarray) + if self.filters is not None: + for filter in self.filters: + chunk_ndarray = await to_thread(filter.encode, chunk_ndarray) return get_ndbuffer_class().from_ndarray_like(chunk_ndarray) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f831dc960d..7b10d91a6a 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -127,9 +127,9 @@ def to_dict(self) -> dict[str, JSON]: "name": "blosc", "configuration": { "typesize": self.typesize, - "cname": self.cname, + "cname": self.cname.value, "clevel": self.clevel, - "shuffle": self.shuffle, + "shuffle": self.shuffle.value, "blocksize": self.blocksize, }, } diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index bc3207be2e..7a683411e9 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -53,7 +53,7 @@ def to_dict(self) -> dict[str, JSON]: if self.endian is None: return {"name": "bytes"} else: - return {"name": "bytes", "configuration": {"endian": self.endian}} + return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if array_spec.dtype.itemsize == 0: diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 3a400d6eaf..a1a4dbaab1 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -84,8 +84,13 @@ def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = Non out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type] return cls.from_list(out, batch_size=batch_size) - def to_dict(self) -> JSON: - return [c.to_dict() for c in self] + def to_dict(self) -> dict[str, JSON]: + return { + "array_array_codecs": tuple(c.to_dict() for c in self.array_array_codecs), + "array_bytes_codec": self.array_bytes_codec.to_dict(), + "bytes_bytes_codec": tuple(c.to_dict() for c in self.bytes_bytes_codecs), + "batch_size": self.batch_size, + } def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self]) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index df7f5978a7..6f9df65692 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -68,7 +68,7 @@ class ShardingCodecIndexLocation(Enum): end = "end" -def parse_index_location(data: JSON) -> ShardingCodecIndexLocation: +def parse_index_location(data: object) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) @@ -333,7 +333,7 @@ def __init__( chunk_shape: ChunkCoordsLike, codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),), index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()), - index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end, + index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end, ) -> None: chunk_shape_parsed = parse_shapelike(chunk_shape) codecs_parsed = parse_codecs(codecs) @@ -379,10 +379,10 @@ def to_dict(self) -> dict[str, JSON]: return { "name": "sharding_indexed", "configuration": { - "chunk_shape": list(self.chunk_shape), - "codecs": [s.to_dict() for s in self.codecs], - "index_codecs": [s.to_dict() for s in self.index_codecs], - "index_location": self.index_location, + "chunk_shape": self.chunk_shape, + "codecs": tuple([s.to_dict() for s in self.codecs]), + "index_codecs": tuple([s.to_dict() for s in self.index_codecs]), + "index_location": self.index_location.value, }, } diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 9bb795a3a1..45eb5bbe5f 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -45,7 +45,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: - return {"name": "transpose", "configuration": {"order": list(self.order)}} + return {"name": "transpose", "configuration": {"order": tuple(self.order)}} def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: if len(self.order) != len(shape): diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7311b6eec2..b16daba0e2 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -90,7 +90,7 @@ def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecP return get_pipeline_class().from_list(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): return get_pipeline_class().from_list( - [V2Filters(metadata.filters or []), V2Compressor(metadata.compressor)] + [V2Filters(metadata.filters or ()), V2Compressor(metadata.compressor)] ) else: raise TypeError @@ -299,8 +299,6 @@ async def _create_v2( attributes: dict[str, JSON] | None = None, exists_ok: bool = False, ) -> AsyncArray: - import numcodecs - if not exists_ok: await ensure_no_existing_node(store_path, zarr_format=2) if order is None: @@ -315,15 +313,9 @@ async def _create_v2( chunks=chunks, order=order, dimension_separator=dimension_separator, - fill_value=0 if fill_value is None else fill_value, - compressor=( - numcodecs.get_codec(compressor).get_config() if compressor is not None else None - ), - filters=( - [numcodecs.get_codec(filter).get_config() for filter in filters] - if filters is not None - else None - ), + fill_value=fill_value, + compressor=compressor, + filters=filters, attributes=attributes, ) array = cls(metadata=metadata, store_path=store_path) diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 99ab58fae9..906467005f 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -4,7 +4,7 @@ import contextvars import functools import operator -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from enum import Enum from typing import ( TYPE_CHECKING, @@ -32,7 +32,7 @@ ChunkCoords = tuple[int, ...] ChunkCoordsLike = Iterable[int] ZarrFormat = Literal[2, 3] -JSON = None | str | int | float | Enum | dict[str, "JSON"] | list["JSON"] | tuple["JSON", ...] +JSON = None | str | int | float | Mapping[str, "JSON"] | tuple["JSON", ...] MemoryOrder = Literal["C", "F"] AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"] @@ -80,7 +80,7 @@ def enum_names(enum: type[E]) -> Iterator[str]: yield item.name -def parse_enum(data: JSON, cls: type[E]) -> E: +def parse_enum(data: object, cls: type[E]) -> E: if isinstance(data, cls): return data if not isinstance(data, str): diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 6d5ecd7e86..af7821bea7 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Iterable from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -14,6 +15,7 @@ import json from dataclasses import dataclass, field, replace +import numcodecs import numpy as np from zarr.core.array_spec import ArraySpec @@ -31,9 +33,9 @@ class ArrayV2Metadata(ArrayMetadata): data_type: np.dtype[Any] fill_value: None | int | float = 0 order: Literal["C", "F"] = "C" - filters: list[dict[str, JSON]] | None = None + filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: dict[str, JSON] | None = None + compressor: numcodecs.abc.Codec | None = None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -46,8 +48,8 @@ def __init__( fill_value: Any, order: Literal["C", "F"], dimension_separator: Literal[".", "/"] = ".", - compressor: dict[str, JSON] | None = None, - filters: list[dict[str, JSON]] | None = None, + compressor: numcodecs.abc.Codec | dict[str, JSON] | None = None, + filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, attributes: dict[str, JSON] | None = None, ): """ @@ -104,11 +106,6 @@ def _json_convert( raise TypeError zarray_dict = self.to_dict() - - # todo: remove this check when we can ensure that to_dict always returns dicts. - if not isinstance(zarray_dict, dict): - raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.") - zattrs_dict = zarray_dict.pop("attributes", {}) json_indent = config.get("json_indent") return { @@ -128,13 +125,8 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _ = parse_zarr_format(_data.pop("zarr_format")) return cls(**_data) - def to_dict(self) -> JSON: + def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - - # todo: remove this check when we can ensure that to_dict always returns dicts. - if not isinstance(zarray_dict, dict): - raise TypeError(f"Invalid type: got {type(zarray_dict)}, expected dict.") - _ = zarray_dict.pop("chunk_grid") zarray_dict["chunks"] = self.chunk_grid.chunk_shape @@ -165,18 +157,44 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) -def parse_zarr_format(data: Literal[2]) -> Literal[2]: +def parse_zarr_format(data: object) -> Literal[2]: if data == 2: - return data + return 2 raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: list[dict[str, JSON]] | None) -> list[dict[str, JSON]] | None: - return data +def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: + """ + Parse a potential tuple of filters + """ + out: list[numcodecs.abc.Codec] = [] + if data is None: + return data + if isinstance(data, Iterable): + for idx, val in enumerate(data): + if isinstance(val, numcodecs.abc.Codec): + out.append(val) + elif isinstance(val, dict): + out.append(numcodecs.get_codec(val)) + else: + msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." + raise TypeError(msg) + return tuple(out) + msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." + raise TypeError(msg) -def parse_compressor(data: dict[str, JSON] | None) -> dict[str, JSON] | None: - return data + +def parse_compressor(data: object) -> numcodecs.abc.Codec | None: + """ + Parse a potential compressor. + """ + if data is None or isinstance(data, numcodecs.abc.Codec): + return data + if isinstance(data, dict): + return numcodecs.get_codec(data) + msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." + raise ValueError(msg) def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: @@ -189,7 +207,7 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data -def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: """ Parse a potential fill value into a value that is compatible with the provided dtype. diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 195c3bd0a2..068a079f76 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -30,19 +30,19 @@ from zarr.registry import get_codec_class, get_pipeline_class -def parse_zarr_format(data: Literal[3]) -> Literal[3]: +def parse_zarr_format(data: object) -> Literal[3]: if data == 3: - return data + return 3 raise ValueError(f"Invalid value. Expected 3. Got {data}.") -def parse_node_type_array(data: Literal["array"]) -> Literal["array"]: +def parse_node_type_array(data: object) -> Literal["array"]: if data == "array": - return data + return "array" raise ValueError(f"Invalid value. Expected 'array'. Got {data}.") -def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]: +def parse_codecs(data: object) -> tuple[Codec, ...]: out: tuple[Codec, ...] = () if not isinstance(data, Iterable): @@ -60,10 +60,10 @@ def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]: return out -def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None: +def parse_dimension_names(data: object) -> tuple[str | None, ...] | None: if data is None: return data - elif all(isinstance(x, type(None) | str) for x in data): + elif isinstance(data, Iterable) and all(isinstance(x, type(None) | str) for x in data): return tuple(data) else: msg = f"Expected either None or a iterable of str, got {type(data)}" @@ -169,7 +169,7 @@ def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: return self.chunk_key_encoding.encode_chunk_key(chunk_coords) def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: - def _json_convert(o: Any) -> Any: + def _json_convert(o: object) -> Any: if isinstance(o, np.dtype): return str(o) if np.isscalar(o): @@ -206,14 +206,14 @@ def _json_convert(o: Any) -> Any: } @classmethod - def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata: + def from_dict(cls, data: dict[str, JSON]) -> Self: # make a copy because we are modifying the dict _data = data.copy() - # TODO: Remove the type: ignores[] comments below and use a TypedDict to type `data` + # check that the zarr_format attribute is correct - _ = parse_zarr_format(_data.pop("zarr_format")) # type: ignore[arg-type] + _ = parse_zarr_format(_data.pop("zarr_format")) # check that the node_type attribute is correct - _ = parse_node_type_array(_data.pop("node_type")) # type: ignore[arg-type] + _ = parse_node_type_array(_data.pop("node_type")) # dimension_names key is optional, normalize missing to `None` _data["dimension_names"] = _data.pop("dimension_names", None) @@ -221,7 +221,7 @@ def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata: _data["attributes"] = _data.pop("attributes", None) return cls(**_data) # type: ignore[arg-type] - def to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, JSON]: out_dict = super().to_dict() if not isinstance(out_dict, dict): @@ -266,23 +266,23 @@ def create_pipeline(data: Iterable[Codec | JSON]) -> CodecPipeline: @overload -def parse_fill_value(fill_value: Any, dtype: BOOL_DTYPE) -> BOOL: ... +def parse_fill_value(fill_value: object, dtype: BOOL_DTYPE) -> BOOL: ... @overload -def parse_fill_value(fill_value: Any, dtype: INTEGER_DTYPE) -> INTEGER: ... +def parse_fill_value(fill_value: object, dtype: INTEGER_DTYPE) -> INTEGER: ... @overload -def parse_fill_value(fill_value: Any, dtype: FLOAT_DTYPE) -> FLOAT: ... +def parse_fill_value(fill_value: object, dtype: FLOAT_DTYPE) -> FLOAT: ... @overload -def parse_fill_value(fill_value: Any, dtype: COMPLEX_DTYPE) -> COMPLEX: ... +def parse_fill_value(fill_value: object, dtype: COMPLEX_DTYPE) -> COMPLEX: ... @overload -def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: # This dtype[Any] is unfortunately necessary right now. # See https://github.com/zarr-developers/zarr-python/issues/2131#issuecomment-2318010899 # for more details, but `dtype` here (which comes from `parse_dtype`) @@ -294,7 +294,8 @@ def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: def parse_fill_value( - fill_value: Any, dtype: BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | np.dtype[Any] + fill_value: object, + dtype: BOOL_DTYPE | INTEGER_DTYPE | FLOAT_DTYPE | COMPLEX_DTYPE | np.dtype[Any], ) -> BOOL | INTEGER | FLOAT | COMPLEX | Any: """ Parse `fill_value`, a potential fill value, into an instance of `dtype`, a data type. @@ -333,7 +334,7 @@ def parse_fill_value( raise ValueError(msg) msg = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype}." raise TypeError(msg) - return dtype.type(fill_value) + return dtype.type(fill_value) # type: ignore[arg-type] # For type checking diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 4465a86471..3ea702eecd 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -9,9 +9,9 @@ from zarr.abc.codec import Codec +import numcodecs import pytest -from zarr.codecs import GzipCodec from zarr.core.metadata.v2 import parse_zarr_format @@ -26,14 +26,14 @@ def test_parse_zarr_format_invalid(data: Any) -> None: @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) -@pytest.mark.parametrize("filters", [(), (GzipCodec().to_dict())]) -@pytest.mark.parametrize("compressor", [None, GzipCodec().to_dict()]) +@pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)]) +@pytest.mark.parametrize("compressor", [None, numcodecs.GZip()]) @pytest.mark.parametrize("fill_value", [0, 1]) @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("dimension_separator", [".", "/", None]) def test_metadata_to_dict( compressor: Codec | None, - filters: list[Codec] | None, + filters: tuple[Codec] | None, fill_value: Any, order: Literal["C", "F"], dimension_separator: Literal[".", "/"] | None, From 7984ea7fb873cae7a3126cf293445b6b696b35db Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 12 Sep 2024 22:48:40 +0200 Subject: [PATCH 2/4] remove unneeded conditional --- src/zarr/core/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b16daba0e2..aaa8d4047d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -90,7 +90,7 @@ def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecP return get_pipeline_class().from_list(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): return get_pipeline_class().from_list( - [V2Filters(metadata.filters or ()), V2Compressor(metadata.compressor)] + [V2Filters(metadata.filters), V2Compressor(metadata.compressor)] ) else: raise TypeError From fdffbb81226515a5bfd3d74422f0c51414a3eaf8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 13 Sep 2024 11:53:56 +0200 Subject: [PATCH 3/4] codecpipeline no longer inherits from metadata, ditches to_dict and from_dict methods --- src/zarr/abc/codec.py | 12 +----------- src/zarr/codecs/pipeline.py | 31 ++++--------------------------- src/zarr/core/metadata/v3.py | 10 ++-------- 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index cc32b9bcfc..d74e92464c 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -17,7 +17,6 @@ from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid - from zarr.core.common import JSON from zarr.core.indexing import SelectorTuple __all__ = [ @@ -242,7 +241,7 @@ async def encode_partial( ) -class CodecPipeline(Metadata): +class CodecPipeline: """Base class for implementing CodecPipeline. A CodecPipeline implements the read and write paths for chunk data. On the read path, it is responsible for fetching chunks from a store (via ByteGetter), @@ -402,15 +401,6 @@ async def write( """ ... - @classmethod - def from_dict(cls, data: Iterable[JSON | Codec]) -> Self: - """ - Create an instance of the model from a dictionary - """ - ... - - return cls(**data) - async def _batching_helper( func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]], diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index a1a4dbaab1..8d3e354c5e 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections.abc import Iterable, Iterator from dataclasses import dataclass from itertools import islice, pairwise from typing import TYPE_CHECKING, Any, TypeVar @@ -15,12 +14,14 @@ Codec, CodecPipeline, ) -from zarr.core.common import JSON, ChunkCoords, concurrent_map, parse_named_configuration +from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice -from zarr.registry import get_codec_class, register_pipeline +from zarr.registry import register_pipeline if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + import numpy as np from typing_extensions import Self @@ -68,30 +69,6 @@ class BatchedCodecPipeline(CodecPipeline): bytes_bytes_codecs: tuple[BytesBytesCodec, ...] batch_size: int - @classmethod - def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = None) -> Self: - out: list[Codec] = [] - if not isinstance(data, Iterable): - raise TypeError(f"Expected iterable, got {type(data)}") - - for c in data: - if isinstance( - c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec - ): # Can't use Codec here because of mypy limitation - out.append(c) - else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type] - return cls.from_list(out, batch_size=batch_size) - - def to_dict(self) -> dict[str, JSON]: - return { - "array_array_codecs": tuple(c.to_dict() for c in self.array_array_codecs), - "array_bytes_codec": self.array_bytes_codec.to_dict(), - "bytes_bytes_codec": tuple(c.to_dict() for c in self.bytes_bytes_codecs), - "batch_size": self.batch_size, - } - def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self]) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 068a079f76..10047cbb93 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -19,7 +19,7 @@ import numcodecs.abc import numpy as np -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.core.array_spec import ArraySpec from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid @@ -27,7 +27,7 @@ from zarr.core.common import ZARR_JSON, parse_dtype, parse_named_configuration, parse_shapelike from zarr.core.config import config from zarr.core.metadata.common import ArrayMetadata, parse_attributes -from zarr.registry import get_codec_class, get_pipeline_class +from zarr.registry import get_codec_class def parse_zarr_format(data: object) -> Literal[3]: @@ -240,12 +240,6 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: return replace(self, attributes=attributes) -def create_pipeline(data: Iterable[Codec | JSON]) -> CodecPipeline: - if not isinstance(data, Iterable): - raise TypeError(f"Expected iterable, got {type(data)}") - return get_pipeline_class().from_dict(data) - - BOOL = np.bool_ BOOL_DTYPE = np.dtypes.BoolDType INTEGER_DTYPE = ( From 14db760c02206613ed7d3e38f2893e858876cd87 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 13 Sep 2024 11:55:06 +0200 Subject: [PATCH 4/4] rename from_list to from_codecs --- src/zarr/abc/codec.py | 6 +++--- src/zarr/codecs/pipeline.py | 4 ++-- src/zarr/codecs/sharding.py | 8 ++++---- src/zarr/core/array.py | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index d74e92464c..2098d989e9 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -265,12 +265,12 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: @classmethod @abstractmethod - def from_list(cls, codecs: Iterable[Codec]) -> Self: - """Creates a codec pipeline from a list of codecs. + def from_codecs(cls, codecs: Iterable[Codec]) -> Self: + """Creates a codec pipeline from an iterable of codecs. Parameters ---------- - codecs : list[Codec] + codecs : Iterable[Codec] Returns ------- diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 8d3e354c5e..182621c59f 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -70,10 +70,10 @@ class BatchedCodecPipeline(CodecPipeline): batch_size: int def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self]) + return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self) @classmethod - def from_list(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) return cls( diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6f9df65692..3ae51ce54b 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -373,7 +373,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: @property def codec_pipeline(self) -> CodecPipeline: - return get_pipeline_class().from_list(self.codecs) + return get_pipeline_class().from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: return { @@ -620,7 +620,7 @@ async def _decode_shard_index( index_array = next( iter( await get_pipeline_class() - .from_list(self.index_codecs) + .from_codecs(self.index_codecs) .decode( [(index_bytes, self._get_index_chunk_spec(chunks_per_shard))], ) @@ -633,7 +633,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = next( iter( await get_pipeline_class() - .from_list(self.index_codecs) + .from_codecs(self.index_codecs) .encode( [ ( @@ -651,7 +651,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int: return ( get_pipeline_class() - .from_list(self.index_codecs) + .from_codecs(self.index_codecs) .compute_encoded_size( 16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard) ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index aaa8d4047d..3a455b239f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -87,9 +87,9 @@ def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata: def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> CodecPipeline: if isinstance(metadata, ArrayV3Metadata): - return get_pipeline_class().from_list(metadata.codecs) + return get_pipeline_class().from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): - return get_pipeline_class().from_list( + return get_pipeline_class().from_codecs( [V2Filters(metadata.filters), V2Compressor(metadata.compressor)] ) else: