Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allowing boolean array in delayed_subsample #9

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 28 additions & 13 deletions geoutils/raster/delayed.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Module for dask-delayed functions for out-of-memory raster operations.
"""

from __future__ import annotations

import warnings
Expand All @@ -15,7 +16,7 @@
from dask.utils import cached_cumsum
from scipy.interpolate import interpn

from geoutils._typing import NDArrayNum
from geoutils._typing import NDArrayBool, NDArrayNum
from geoutils.projtools import _get_bounds_projected, _get_footprint_projected

# 1/ SUBSAMPLING
Expand Down Expand Up @@ -96,28 +97,37 @@ def _get_indices_block_per_subsample(


@dask.delayed # type: ignore
def _delayed_nb_valids(arr_chunk: NDArrayNum) -> NDArrayNum:
def _delayed_nb_valids(arr_chunk: NDArrayNum | NDArrayBool) -> NDArrayNum:
"""Count number of valid values per block."""
if arr_chunk.dtype == "bool":
return np.array([np.count_nonzero(arr_chunk)]).reshape((1, 1))
return np.array([np.count_nonzero(np.isfinite(arr_chunk))]).reshape((1, 1))


@dask.delayed # type: ignore
def _delayed_subsample_block(arr_chunk: NDArrayNum, subsample_indices: NDArrayNum) -> NDArrayNum:
def _delayed_subsample_block(
arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum
) -> NDArrayNum | NDArrayBool:
"""Subsample the valid values at the corresponding 1D valid indices per block."""

s_chunk = arr_chunk[np.isfinite(arr_chunk)][subsample_indices]

return s_chunk
if arr_chunk.dtype == "bool":
return arr_chunk[arr_chunk][subsample_indices]
return arr_chunk[np.isfinite(arr_chunk)][subsample_indices]


@dask.delayed # type: ignore
def _delayed_subsample_indices_block(
arr_chunk: NDArrayNum, subsample_indices: NDArrayNum, block_id: dict[str, Any]
arr_chunk: NDArrayNum | NDArrayBool, subsample_indices: NDArrayNum, block_id: dict[str, Any]
) -> NDArrayNum:
"""Return 2D indices from the subsampled 1D valid indices per block."""

# Unravel indices of valid data to the shape of the block
ix, iy = np.unravel_index(np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape)
if arr_chunk.dtype == "bool":
ix, iy = np.unravel_index(np.argwhere(arr_chunk.flatten())[subsample_indices], shape=arr_chunk.shape)
else:
# Unravel indices of valid data to the shape of the block
ix, iy = np.unravel_index(
np.argwhere(np.isfinite(arr_chunk.flatten()))[subsample_indices], shape=arr_chunk.shape
)

# Convert to full-array indexes by adding the row and column starting indexes for this block
ix += block_id["xstart"]
Expand Down Expand Up @@ -151,7 +161,10 @@ def delayed_subsample(
return_indices=True, then sample your arrays out-of-memory with .vindex[indices[0], indices[1]]
(this assumes that these arrays have valid values at the same locations).

:param darr: Input dask array.
Only valid values are sampled. If passing a numerical array, then only finite values are considered valid values.
If passing a boolean array, then only True values are considered valid values.

:param darr: Input dask array. This can be a boolean or a numerical array.
:param subsample: Subsample size. If <= 1, will be considered a fraction of valid pixels to extract.
If > 1 will be considered the number of valid pixels to extract.
:param return_indices: If set to True, will return the extracted indices only.
Expand Down Expand Up @@ -722,9 +735,11 @@ def delayed_reproject(
# transform of each tuples of source blocks
src_block_ids = np.array(src_geotiling.get_block_locations())
meta_params = [
_combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid)
if len(sbid) > 0
else ({}, [])
(
_combined_blocks_shape_transform(sub_block_ids=src_block_ids[sbid], src_geogrid=src_geogrid)
if len(sbid) > 0
else ({}, [])
)
for sbid in dest2source
]
# We also add the output transform/shape for this destination chunk in the combined meta
Expand Down
22 changes: 20 additions & 2 deletions tests/test_raster/test_delayed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Tests for dask-delayed functions."""

from __future__ import annotations

import os
Expand Down Expand Up @@ -292,10 +293,18 @@ class TestDelayed:
# List of in-memory chunksize for small tests
list_small_chunksizes_in_mem = [(10, 10), (7, 19)]

@pytest.mark.parametrize("darr", list_small_darr) # type: ignore
# create a corresponding boolean array for each numerical dask array
# every finite numerical value (valid numerical value) corresponds to True (valid boolean value).
darr_bool = []
for small_darr in list_small_darr:
darr_bool.append(da.where(da.isfinite(small_darr), True, False))

@pytest.mark.parametrize("darr, darr_bool", list(zip(list_small_darr, darr_bool))) # type: ignore
@pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore
@pytest.mark.parametrize("subsample_size", [2, 100, 100000]) # type: ignore
def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int):
def test_delayed_subsample__output(
self, darr: da.Array, darr_bool: da.Array, chunksizes_in_mem: tuple[int, int], subsample_size: int
):
"""
Checks for delayed subsampling function for output accuracy.
Variables that influence specifically the delayed function and might lead to new errors are:
Expand All @@ -319,6 +328,15 @@ def test_delayed_subsample__output(self, darr: da.Array, chunksizes_in_mem: tupl
sub2 = np.array(darr.vindex[indices[0], indices[1]])
assert np.array_equal(sub, sub2)

# Finally, to verify that a boolean array, with valid values at the same locations as the numerical array,
# leads to the same results, we compare the samples values and the samples indices.
darr_bool = darr_bool.rechunk(chunksizes_in_mem)
indices_bool = delayed_subsample(darr_bool, subsample=subsample_size, random_state=42, return_indices=True)
sub_bool = np.array(darr.vindex[indices_bool])
assert np.array_equal(sub, sub_bool)
assert np.array_equal(indices, indices_bool)


@pytest.mark.parametrize("darr", list_small_darr) # type: ignore
@pytest.mark.parametrize("chunksizes_in_mem", list_small_chunksizes_in_mem) # type: ignore
@pytest.mark.parametrize("ninterp", [2, 100]) # type: ignore
Expand Down