From bc7ced01e6f95c549fbb5b52eb13bae5a95bec39 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 14 Mar 2024 02:24:18 -0400 Subject: [PATCH 1/3] WIP towards full element-wise comparison --- virtualizarr/manifests/array.py | 71 +++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 6e2dca8c..9cc41bc2 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -6,7 +6,13 @@ from ..kerchunk import KerchunkArrRefs from ..zarr import Codec, ZArray -from .manifest import _CHUNK_KEY, ChunkManifest, concat_manifests, stack_manifests +from .manifest import ( + _CHUNK_KEY, + ChunkManifest, + concat_manifests, + get_ndim_from_key, + stack_manifests, +) MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS: Dict[ str, Callable @@ -165,7 +171,7 @@ def __eq__( # type: ignore[override] """ Element-wise equality checking. - Returns a numpy array of booleans. + Returns a numpy array of booleans, with elements that are True iff the manifests' ChunkEntry that this element would reside in is identical between the two arrays. """ if isinstance(other, (int, float, bool)): # TODO what should this do when comparing against numpy arrays? @@ -181,19 +187,35 @@ def __eq__( # type: ignore[override] if self.zarray != other.zarray: return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) else: - if self.manifest == other.manifest: - return np.full(shape=self.shape, fill_value=True, dtype=np.dtype(bool)) - else: - # TODO this doesn't yet do what it should - it simply returns all False if any of the chunk entries are different. - # What it should do is return True for the locations where the chunk entries are the same. - warnings.warn( - "__eq__ currently is over-cautious, returning an array of all False if any of the chunk entries don't match.", - UserWarning, + # do full element-wise comparison + + # do chunk-wise comparison + boolean_chunk_dict = { + key: entry1 == entry2 + for key, entry1, entry2 in zip( + self.manifest.entries.keys(), + self.manifest.entries.values(), + other.manifest.entries.values(), ) + } - # TODO do chunk-wise comparison - # TODO expand it into an element-wise result - return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) + # replace per-chunk booleans with numpy arrays of booleans of the shape of each chunk + array_boolean_chunk_dict = { + key: np.full( + shape=self.chunks, fill_value=bool_val, dtype=np.dtype(bool) + ) + for key, bool_val in boolean_chunk_dict + } + + # assemble chunk-wise boolean blocks into an n-dimensional nested list + nested_list = _nested_list_from_chunk_keys(array_boolean_chunk_dict) + + # assemble into the full result + result = np.block(nested_list) + + # trim off any extra elements due to the final zarr chunk potentially having a different size + indexer = tuple([slice(None, length) for length in self.shape]) + return result[indexer] def __getitem__( self, @@ -230,6 +252,29 @@ def __getitem__( raise NotImplementedError(f"Doesn't support slicing with {indexer}") +# Define type for arbitrarily-nested list of lists recursively: +OBJECT_LIST_HYPERCUBE = Union[Any, List["OBJECT_LIST_HYPERCUBE"]] + + +def _nested_list_from_chunk_keys(chunk_dict: dict[str, Any]) -> OBJECT_LIST_HYPERCUBE: + """Takes a mapping of chunk keys to values and returns an n-dimensional nested list containing those values in order.""" + + first_key, *other_keys = chunk_dict.keys() + ndim = get_ndim_from_key(first_key) + + _chunk_dict = chunk_dict + for _ in range(ndim): + _chunk_dict = _stack_along_final_dim(_chunk_dict) + + nested_list = _chunk_dict[""] + return nested_list + + +def _stack_along_final_dim(d: dict[str, Any]) -> OBJECT_LIST_HYPERCUBE: + *other_key_indices, order = split(key) + order = + + def _possibly_expand_trailing_ellipsis(key, ndim: int): if key[-1] == ...: extra_slices_needed = ndim - (len(key) - 1) From 1a1034aaaa7b8b691c18f99ae57a3afb40c9ca4a Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Jul 2024 04:51:46 -0700 Subject: [PATCH 2/3] test partial equality checking --- .../tests/test_manifests/test_array.py | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 6d5ede79..c21c9763 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -119,8 +119,47 @@ def test_not_equal_chunk_entries(self): marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) assert not (marr1 == marr2).all() - @pytest.mark.skip(reason="Not Implemented") - def test_partly_equals(self): ... + def test_partly_equals(self): + # both manifest arrays in this example have the same zarray properties + zarray = ZArray( + chunks=(5, 1, 10), + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=(5, 1, 20), + zarr_format=2, + ) + + chunks_dict1 = { + "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "foo.nc", "offset": 200, "length": 100}, + } + manifest1 = ChunkManifest(entries=chunks_dict1) + marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) + + chunks_dict2 = { + "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.0.1": {"path": "bar.nc", "offset": 200, "length": 100}, + } + manifest2 = ChunkManifest(entries=chunks_dict2) + marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) + + expected = np.concatenate( + [ + np.repeat(True, np.prod(marr1.zarray.chunks)).reshape( + marr1.zarray.chunks + ), + np.repeat(False, np.prod(marr1.zarray.chunks)).reshape( + marr1.zarray.chunks + ), + ], + axis=2, + ) + + comparison = marr1 == marr2 + assert (comparison == expected).all() class TestBroadcast: From 7044737e7a037505076e12e812df9b09887dc216 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 29 Jul 2024 04:52:00 -0700 Subject: [PATCH 3/3] test passing --- virtualizarr/manifests/array.py | 55 +++++++++++++-------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 9688877e..a6729546 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Callable, Union import numpy as np @@ -143,7 +142,7 @@ def __eq__( # type: ignore[override] """ Element-wise equality checking. - Returns a numpy array of booleans, with elements that are True iff the manifests' ChunkEntry that this element would reside in is identical between the two arrays. + Returns a numpy array of booleans, with elements that are True iff the manifests' chunk that this element would reside in is identical between the two arrays. """ if isinstance(other, (int, float, bool, np.ndarray)): # TODO what should this do when comparing against numpy arrays? @@ -164,17 +163,28 @@ def __eq__( # type: ignore[override] equal_chunk_offsets = self.manifest._offsets == other.manifest._offsets equal_chunk_lengths = self.manifest._lengths == other.manifest._lengths - equal_chunks = ( - equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths - ) + equal_chunks = equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths + + def generate_boolean_subarrays( + boolean_element: bool, chunks: tuple[int, ...] + ) -> np.ndarray[Any, np.dtype[np.bool]]: # type: ignore[name-defined] + """If a chunk is not equivalent then all elements in it are considered not equal, so repeat the boolean result.""" + return ( + np.repeat(boolean_element, repeats=np.prod(chunks)) + .reshape(chunks) + .astype(bool) # type: ignore[attr-defined] # TODO do we need this? + ) - if not equal_chunks.all(): - # TODO expand chunk-wise comparison into an element-wise result instead of just returning all False - return np.full( - shape=self.shape, fill_value=False, dtype=np.dtype(bool) + # Replace every chunk in the manifest with the new sub-array of booleans + # Create a nested list of subarrays using np.ndindex for general N-dimensional support + boolean_subarrays = np.empty(self.manifest.shape_chunk_grid, dtype=object) + for idx in np.ndindex(self.manifest.shape_chunk_grid): + boolean_subarrays[idx] = generate_boolean_subarrays( + equal_chunks[idx], self.chunks ) - else: - raise RuntimeWarning("Should not be possible to get here") + + # Use np.block to assemble the subarrays into boolean array of same shape as original ManifestArray + return np.block(boolean_subarrays.tolist()) def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": """Cannot change the dtype, but needed because xarray will call this even when it's a no-op.""" @@ -257,29 +267,6 @@ def rename_paths( return ManifestArray(zarray=self.zarray, chunkmanifest=renamed_manifest) -# Define type for arbitrarily-nested list of lists recursively: -OBJECT_LIST_HYPERCUBE = Union[Any, List["OBJECT_LIST_HYPERCUBE"]] - - -def _nested_list_from_chunk_keys(chunk_dict: dict[str, Any]) -> OBJECT_LIST_HYPERCUBE: - """Takes a mapping of chunk keys to values and returns an n-dimensional nested list containing those values in order.""" - - first_key, *other_keys = chunk_dict.keys() - ndim = get_ndim_from_key(first_key) - - _chunk_dict = chunk_dict - for _ in range(ndim): - _chunk_dict = _stack_along_final_dim(_chunk_dict) - - nested_list = _chunk_dict[""] - return nested_list - - -def _stack_along_final_dim(d: dict[str, Any]) -> OBJECT_LIST_HYPERCUBE: - *other_key_indices, order = split(key) - order = - - def _possibly_expand_trailing_ellipsis(key, ndim: int): if key[-1] == ...: extra_slices_needed = ndim - (len(key) - 1)