From 3bf3622f5c34dec4f1a829bc9b455451bb21b11c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 07:18:47 -0500 Subject: [PATCH 1/3] Use implicit fill values for zarr v2 --- src/zarr/codecs/pipeline.py | 14 ++++++++++++-- src/zarr/core/array.py | 2 +- src/zarr/core/metadata/v2.py | 9 ++++++++- tests/v3/test_metadata/test_v2.py | 2 +- tests/v3/test_v2.py | 11 +++++++++++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 6828377f97..c77c06bd1b 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -17,6 +17,7 @@ from zarr.core.common import ChunkCoords, concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice +from zarr.core.metadata.v2 import default_fill_value from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -247,7 +248,13 @@ async def read_batch( if chunk_array is not None: out[out_selection] = chunk_array else: - out[out_selection] = chunk_spec.fill_value + fill_value = chunk_spec.fill_value + + # this should maybe be version specific? + if fill_value is None: + fill_value = default_fill_value(dtype=chunk_spec.dtype) + + out[out_selection] = fill_value else: chunk_bytes_batch = await concurrent_map( [ @@ -274,7 +281,10 @@ async def read_batch( tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: - out[out_selection] = chunk_spec.fill_value + fill_value = chunk_spec.fill_value + if fill_value is None: + fill_value = default_fill_value(dtype=chunk_spec.dtype) + out[out_selection] = fill_value def _merge_chunk_array( self, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e1de15c747..e44f5ede6d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -376,7 +376,7 @@ async def _create_v2( chunks=chunks, order=order, dimension_separator=dimension_separator, - fill_value=0 if fill_value is None else fill_value, + fill_value=fill_value, compressor=compressor, filters=filters, attributes=attributes, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index df7f2abaea..132a5c1283 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -62,7 +62,10 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) + if fill_value is None: + fill_value_parsed = None + else: + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -273,3 +276,7 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: raise ValueError(msg) from e return fill_value + + +def default_fill_value(dtype: np.dtype[Any]) -> Any: + return dtype.type(0) diff --git a/tests/v3/test_metadata/test_v2.py b/tests/v3/test_metadata/test_v2.py index 3ea702eecd..fa961e34cd 100644 --- a/tests/v3/test_metadata/test_v2.py +++ b/tests/v3/test_metadata/test_v2.py @@ -28,7 +28,7 @@ def test_parse_zarr_format_invalid(data: Any) -> None: @pytest.mark.parametrize("attributes", [None, {"foo": "bar"}]) @pytest.mark.parametrize("filters", [None, (), (numcodecs.GZip(),)]) @pytest.mark.parametrize("compressor", [None, numcodecs.GZip()]) -@pytest.mark.parametrize("fill_value", [0, 1]) +@pytest.mark.parametrize("fill_value", [None, 0, 1]) @pytest.mark.parametrize("order", ["C", "F"]) @pytest.mark.parametrize("dimension_separator", [".", "/", None]) def test_metadata_to_dict( diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index 943c425f54..7d751d5f53 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -31,6 +31,15 @@ def test_simple(store: StorePath) -> None: assert np.array_equal(data, a[:, :]) +def test_implicit_fill_value(store: StorePath) -> None: + arr = zarr.open_array(store=store, shape=(4,), fill_value=None, zarr_format=2) + assert arr.metadata.fill_value is None + assert arr.metadata.to_dict()["fill_value"] is None + result = arr[:] + expected = np.zeros(arr.shape, dtype=arr.dtype) + np.testing.assert_array_equal(result, expected) + + def test_codec_pipeline() -> None: # https://github.com/zarr-developers/zarr-python/issues/2243 store = MemoryStore(mode="w") @@ -46,3 +55,5 @@ def test_codec_pipeline() -> None: result = array[:] expected = np.ones(1) np.testing.assert_array_equal(result, expected) + + From a8caaa40ddf290361cfe7fced5a7b9c0f8afbad5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 7 Oct 2024 08:42:21 -0500 Subject: [PATCH 2/3] Fixup --- src/zarr/codecs/pipeline.py | 6 +++++- src/zarr/core/metadata/v2.py | 17 +++++++++++++---- tests/v3/test_v2.py | 2 -- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index c77c06bd1b..14d2877274 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -250,8 +250,12 @@ async def read_batch( else: fill_value = chunk_spec.fill_value - # this should maybe be version specific? if fill_value is None: + # Zarr V2 allowed `fill_value` to be null in the metadata. + # Zarr V3 requires it to be set. This has already been + # validated when decoding the metadata, but we support reading + # Zarr V2 data and need to support the case where fill_value + # is None. fill_value = default_fill_value(dtype=chunk_spec.dtype) out[out_selection] = fill_value diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ca42cd2cca..72c0f5c28a 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -62,10 +62,7 @@ def __init__( order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) - if fill_value is None: - fill_value_parsed = None - else: - fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) + fill_value_parsed = parse_fill_value(fill_value, dtype=data_type_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -290,4 +287,16 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: def default_fill_value(dtype: np.dtype[Any]) -> Any: + """ + Get the default fill value for a type. + + Notes + ----- + This differs from :func:`parse_fill_value`, which parses a fill value + stored in the Array metadata into an in-memory value. This only gives + the default fill value for some type. + + This is useful for reading Zarr V2 arrays, which allow the fill + value to be unspecified. + """ return dtype.type(0) diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py index 1a16980f2e..644883dd28 100644 --- a/tests/v3/test_v2.py +++ b/tests/v3/test_v2.py @@ -55,5 +55,3 @@ def test_codec_pipeline() -> None: result = array[:] expected = np.ones(1) np.testing.assert_array_equal(result, expected) - - From 8637c08cbebd6c468ce3177e8e3e9230977acac2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 7 Oct 2024 08:56:57 -0500 Subject: [PATCH 3/3] update asserts --- src/zarr/testing/strategies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 234454e289..8494d35939 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -140,7 +140,8 @@ def arrays( ) assert isinstance(a, Array) - assert a.fill_value is not None + if a.metadata.zarr_format == 3: + assert a.fill_value is not None assert isinstance(root[array_path], Array) assert nparray.shape == a.shape assert chunks == a.chunks