rhugonnet · rhugonnet · May 22, 2024 · May 21, 2024 · May 22, 2024 · May 22, 2024
diff --git a/geoutils/raster/delayed.py b/geoutils/raster/delayed.py
@@ -1,6 +1,7 @@
 """
 Module for dask-delayed functions for out-of-memory raster operations.
 """
+
 from __future__ import annotations
 
 import warnings
@@ -15,7 +16,7 @@
 from dask.utils import cached_cumsum
 from scipy.interpolate import interpn
 
-from geoutils._typing import NDArrayNum
+from geoutils._typing import NDArrayBool, NDArrayNum
 from geoutils.projtools import _get_bounds_projected, _get_footprint_projected
 
 # 1/ SUBSAMPLING
@@ -96,28 +97,37 @@ def _get_indices_block_per_subsample(
 
 
 @dask.delayed  # type: ignore
-def _delayed_nb_valids(arr_chunk: NDArrayNum) -> NDArrayNum:
+def _delayed_nb_valids(arr_chunk: NDArrayNum | NDArrayBool) -> NDArrayNum:
     """Count number of valid values per block."""
+    if arr_chunk.dtype == "bool":
+        return np.array([np.count_nonzero(arr_chunk)]).reshape((1, 1))
     return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1))
 
 
 @dask.delayed  # type: ignore
-def _delayed_subsample_block(arr_chunk: NDArrayNum, subsample_indices: NDArrayNum) -> NDArrayNum:
+def _delayed_subsample_block(
+    arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum
+) -> NDArrayNum | NDArrayBool:
     """Subsample the valid values at the corresponding 1D valid indices per block."""
 
-    s_chunk = arr_chunk[np.isfinite(arr_chunk)][subsample_indices]
-
-    return s_chunk
+    if arr_chunk.dtype == "bool":
+        return arr_chunk[arr_chunk][subsample_indices]
+    return arr_chunk[np.isfinite(arr_chunk)][subsample_indices]
 
 
 @dask.delayed  # type: ignore
 def _delayed_subsample_indices_block(
-    arr_chunk: NDArrayNum, subsample_indices: NDArrayNum, block_id: dict[str, Any]
+    arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum, block_id: dict[str, Any]
 ) -> NDArrayNum:
     """Return 2D indices from the subsampled 1D valid indices per block."""
 
-    # Unravel indices of valid data to the shape of the block
-    ix, iy = np.unravel_index(np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape)
+    if arr_chunk.dtype == "bool":
+        ix, iy = np.unravel_index(np.argwhere(arr_chunk.flatten())[subsample_indices], shape=arr_chunk.shape)
+    else:
+        #  Unravel indices of valid data to the shape of the block
+        ix, iy = np.unravel_index(
+            np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape
+        )
 
     # Convert to full-array indexes by adding the row and column starting indexes for this block
     ix += block_id["xstart"]
@@ -151,7 +161,10 @@ def delayed_subsample(
     return_indices=True, then sample your arrays out-of-memory with .vindex[indices[0], indices[1]]
     (this assumes that these arrays have valid values at the same locations).
 
-    :param darr: Input dask array.
+    Only valid values are sampled. If passing a numerical array, then only finite values are considered valid values.
+    If passing a boolean array, then only True values are considered valid values.
+
+    :param darr: Input dask array. This can be a boolean or a numerical array.
     :param subsample: Subsample size. If <= 1, will be considered a fraction of valid pixels to extract.
         If > 1 will be considered the number of valid pixels to extract.
     :param return_indices: If set to True, will return the extracted indices only.
@@ -722,9 +735,11 @@ def delayed_reproject(
     # transform of each tuples of source blocks
     src_block_ids = np.array(src_geotiling.get_block_locations())
     meta_params = [
-        _combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid)
-        if len(sbid) > 0
-        else ({}, [])
+        (
+            _combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid)
+            if len(sbid) > 0
+            else ({}, [])
+        )
         for sbid in dest2source
     ]
     # We also add the output transform/shape for this destination chunk in the combined meta

diff --git a/tests/test_raster/test_delayed.py b/tests/test_raster/test_delayed.py
@@ -1,4 +1,5 @@
 """Tests for dask-delayed functions."""
+
 from __future__ import annotations
 
 import os
@@ -292,10 +293,18 @@ class TestDelayed:
     # List of in-memory chunksize for small tests
     list_small_chunksizes_in_mem = [(10, 10), (7, 19)]
 
-    @pytest.mark.parametrize("darr", list_small_darr)  # type: ignore
+    # create a corresponding boolean array for each numerical dask array
+    # every finite numerical value (valid numerical value) corresponds to True (valid boolean value).
+    darr_bool = []
+    for small_darr in list_small_darr:
+        darr_bool.append(da.where(da.isfinite(small_darr), True, False))
+
+    @pytest.mark.parametrize("darr, darr_bool", list(zip(list_small_darr, darr_bool)))  # type: ignore
     @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem)  # type: ignore
     @pytest.mark.parametrize("subsample_size", [2, 100, 100000])  # type: ignore
-    def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int):
+    def test_delayed_subsample__output(
+        self, darr: da.Array, darr_bool: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int
+    ):
         """
         Checks for delayed subsampling function for output accuracy.
         Variables that influence specifically the delayed function and might lead to new errors are:
@@ -319,6 +328,15 @@ def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tupl
         sub2 = np.array(darr.vindex[indices[0], indices[1]])
         assert np.array_equal(sub, sub2)
 
+        # Finally, to verify that a boolean array, with valid values at the same locations as the numerical array,
+        # leads to the same results, we compare the samples values and the samples indices.
+        darr_bool = darr_bool.rechunk(chunksizes_in_mem)
+        indices_bool = delayed_subsample(darr_bool, subsample=subsample_size, random_state=42, return_indices=True)
+        sub_bool = np.array(darr.vindex[indices_bool])
+        assert np.array_equal(sub, sub_bool)
+        assert np.array_equal(indices, indices_bool)
+
+
     @pytest.mark.parametrize("darr", list_small_darr)  # type: ignore
     @pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem)  # type: ignore
     @pytest.mark.parametrize("ninterp", [2, 100])  # type: ignore