From c69897d8e851049ad788fcd896170d2c906cba06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20H=C3=B6lzer?= Date: Mon, 25 Nov 2024 19:16:47 +0100 Subject: [PATCH] Use legacy post processing --- .../darts_postprocessing/prepare_export.py | 174 +++++++++++++++++- .../src/darts_postprocessing/utils.py | 16 ++ darts/src/darts/native.py | 56 +++++- notebooks/test-e2e.ipynb | 47 ++++- 4 files changed, 274 insertions(+), 19 deletions(-) create mode 100644 darts-postprocessing/src/darts_postprocessing/utils.py diff --git a/darts-postprocessing/src/darts_postprocessing/prepare_export.py b/darts-postprocessing/src/darts_postprocessing/prepare_export.py index d9f2fb0..e3dd8d2 100644 --- a/darts-postprocessing/src/darts_postprocessing/prepare_export.py +++ b/darts-postprocessing/src/darts_postprocessing/prepare_export.py @@ -1,33 +1,195 @@ """Prepare the export, e.g. binarizes the data and convert the float probabilities to uint8.""" +import logging +from typing import Literal + +import numpy as np import xarray as xr +from skimage.morphology import binary_erosion, disk, label, remove_small_objects + +from darts_postprocessing.utils import free_cuda + +logger = logging.getLogger(__name__.replace("darts_", "darts.")) + +try: + import cupy as cp + from cucim.skimage.morphology import binary_erosion as binary_erosion_gpu + from cucim.skimage.morphology import disk as disk_gpu + from cucim.skimage.morphology import remove_small_objects as remove_small_objects_gpu + + CUCIM_AVAILABLE = True + DEFAULT_DEVICE = "cuda" + logger.debug("GPU-accelerated xrspatial functions are available.") +except ImportError: + CUCIM_AVAILABLE = False + DEFAULT_DEVICE = "cpu" + logger.debug("GPU-accelerated xrspatial functions are not available.") + + +def erode_mask(mask: xr.DataArray, size: int, device: Literal["cuda", "cpu"] | int) -> xr.DataArray: + """Erode the mask, also set the edges to invalid. + + Args: + mask (xr.DataArray): The mask to erode. + size (int): The size of the disk to use for erosion and the edge-cropping. + device (Literal["cuda", "cpu"] | int): The device to use for erosion. + + Returns: + xr.DataArray: The dilated and inverted mask. + + """ + # Clone mask to avoid in-place operations + mask = mask.copy() + + # Change to dtype uint8 for faster skimage operations + mask = mask.astype("uint8") + + use_gpu = device == "cuda" or isinstance(device, int) + + # Warn user if use_gpu is set but no GPU is available + if use_gpu and not CUCIM_AVAILABLE: + logger.warning( + f"Device was set to {device}, but GPU acceleration is not available. Calculating TPI and slope on CPU." + ) + use_gpu = False + + # Dilate the mask with GPU + if use_gpu: + device_nr = device if isinstance(device, int) else 0 + logger.debug(f"Moving mask to GPU:{device}.") + # Check if mask is dask, if not persist it, since dilation can't be calculated from cupy-dask arrays + if mask.chunks is not None: + mask = mask.persist() + with cp.cuda.Device(device_nr): + mask = mask.cupy.as_cupy() + mask.values = binary_erosion_gpu(mask.data, disk_gpu(size)) + mask = mask.cupy.as_numpy() + free_cuda() + else: + mask.values = binary_erosion(mask.values, disk(size)) + + # Mask edges + mask[:size, :] = 0 + mask[-size:, :] = 0 + mask[:, :size] = 0 + mask[:, -size:] = 0 + return mask -def prepare_export(tile: xr.Dataset) -> xr.Dataset: + +def binarize( + probs: xr.DataArray, + threshold: float, + min_object_size: int, + mask: xr.DataArray, + device: Literal["cuda", "cpu"] | int, +) -> xr.DataArray: + """Binarize the probabilities based on a threshold and a mask. + + Steps for binarization: + 1. Dilate the mask. This will dilate the edges of holes in the mask as well as the edges of the tile. + 2. Binarize the probabilities based on the threshold. + 3. Remove objects at which overlap with either the edge of the tile or the noData mask. + 4. Remove small objects. + + Args: + probs (xr.DataArray): Probabilities to binarize. + threshold (float): Threshold to binarize the probabilities. + min_object_size (int): Minimum object size to keep. + mask (xr.DataArray): Mask to apply to the binarized probabilities. Expects 0=negative, 1=postitive. + device (Literal["cuda", "cpu"] | int): The device to use for removing small objects. + + Returns: + xr.DataArray: Binarized probabilities. + + """ + use_gpu = device == "cuda" or isinstance(device, int) + + # Warn user if use_gpu is set but no GPU is available + if use_gpu and not CUCIM_AVAILABLE: + logger.warning( + f"Device was set to {device}, but GPU acceleration is not available. Calculating TPI and slope on CPU." + ) + use_gpu = False + + # Where the output from the ensemble / segmentation is nan turn it into 0, else threshold it + # Also, where there was no valid input data, turn it into 0 + binarized = (probs.fillna(0) > threshold).astype("uint8") + + # Remove objects at which overlap with either the edge of the tile or the noData mask + labels = binarized.copy(data=label(binarized, connectivity=2)) + edge_label_ids = np.unique(xr.where(~mask, labels, 0)) + binarized = ~labels.isin(edge_label_ids) & binarized + + # Remove small objects with GPU + if use_gpu: + device_nr = device if isinstance(device, int) else 0 + logger.debug(f"Moving binarized to GPU:{device}.") + # Check if binarized is dask, if not persist it, since remove_small_objects_gpu can't be calculated from + # cupy-dask arrays + if binarized.chunks is not None: + binarized = binarized.persist() + with cp.cuda.Device(device_nr): + binarized = binarized.cupy.as_cupy() + binarized.values = remove_small_objects_gpu( + binarized.astype(bool).expand_dims("batch", 0).data, min_size=min_object_size + )[0] + binarized = binarized.cupy.as_numpy() + free_cuda() + else: + binarized.values = remove_small_objects( + binarized.astype(bool).expand_dims("batch", 0).values, min_size=min_object_size + )[0] + + # Convert back to int8 + binarized = binarized.astype("uint8") + + return binarized + + +def prepare_export( + tile: xr.Dataset, + bin_threshold: float = 0.5, + mask_erosion_size: int = 10, + min_object_size: int = 32, + use_quality_mask: bool = False, + device: Literal["cuda", "cpu"] | int = DEFAULT_DEVICE, +) -> xr.Dataset: """Prepare the export, e.g. binarizes the data and convert the float probabilities to uint8. Args: tile (xr.Dataset): Input tile from inference and / or an ensemble. + bin_threshold (float, optional): The threshold to binarize the probabilities. Defaults to 0.5. + mask_erosion_size (int, optional): The size of the disk to use for mask erosion and the edge-cropping. + Defaults to 10. + min_object_size (int, optional): The minimum object size to keep in pixel. Defaults to 32. + use_quality_mask (bool, optional): Whether to use the "quality" mask instead of the + "valid" mask to mask the output. Defaults to False. + device (Literal["cuda", "cpu"] | int, optional): The device to use for dilation. + Defaults to "cuda" if cuda for cucim is available, else "cpu". Returns: xr.Dataset: Output tile. """ + mask_name = "quality_data_mask" if use_quality_mask else "valid_data_mask" + tile[mask_name] = erode_mask(tile[mask_name], mask_erosion_size, device) # 0=positive, 1=negative def _prep_layer(tile, layername, binarized_layer_name): # Binarize the segmentation - # Where the output from the ensemble / segmentation is nan turn it into 0, else threshold it - # Also, where there was no valid input data, turn it into 0 - binarized = (tile[layername].fillna(0) > 0.5).astype("uint8") - tile[binarized_layer_name] = xr.where(tile["valid_data_mask"], binarized, 0) + tile[binarized_layer_name] = binarize(tile[layername], bin_threshold, min_object_size, tile[mask_name], device) tile[binarized_layer_name].attrs = { "long_name": "Binarized Segmentation", } # Convert the probabilities to uint8 # Same but this time with 255 as no-data + # But first check if this step was already run + if tile[layername].max() > 1: + return tile + intprobs = (tile[layername] * 100).fillna(255).astype("uint8") - tile[layername] = xr.where(tile["valid_data_mask"], intprobs, 255) + tile[layername] = xr.where(tile[mask_name], intprobs, 255) tile[layername].attrs = { "long_name": "Probabilities", "units": "%", diff --git a/darts-postprocessing/src/darts_postprocessing/utils.py b/darts-postprocessing/src/darts_postprocessing/utils.py new file mode 100644 index 0000000..be87457 --- /dev/null +++ b/darts-postprocessing/src/darts_postprocessing/utils.py @@ -0,0 +1,16 @@ +"""Utility functions for Darts Preprocessing.""" + +import gc + +try: + import cupy as cp +except ImportError: + cp = None + + +def free_cuda(): + """Free the CUDA memory.""" + if cp is not None: + gc.collect() + cp.get_default_memory_pool().free_all_blocks() + cp.get_default_pinned_memory_pool().free_all_blocks() diff --git a/darts/src/darts/native.py b/darts/src/darts/native.py index 3930f6b..7c7e87e 100644 --- a/darts/src/darts/native.py +++ b/darts/src/darts/native.py @@ -50,6 +50,10 @@ def run_native_planet_pipeline( overlap: int = 16, batch_size: int = 8, reflection: int = 0, + binarization_threshold: float = 0.5, + mask_erosion_size: int = 10, + min_object_size: int = 32, + use_quality_mask: bool = False, write_model_outputs: bool = False, ): """Search for all PlanetScope scenes in the given directory and runs the segmentation pipeline on them. @@ -74,6 +78,12 @@ def run_native_planet_pipeline( overlap (int, optional): The overlap to use for inference. Defaults to 16. batch_size (int, optional): The batch size to use for inference. Defaults to 8. reflection (int, optional): The reflection padding to use for inference. Defaults to 0. + binarization_threshold (float, optional): The threshold to binarize the probabilities. Defaults to 0.5. + mask_erosion_size (int, optional): The size of the disk to use for mask erosion and the edge-cropping. + Defaults to 10. + min_object_size (int, optional): The minimum object size to keep in pixel. Defaults to 32. + use_quality_mask (bool, optional): Whether to use the "quality" mask instead of the "valid" mask + to mask the output. write_model_outputs (bool, optional): Also save the model outputs, not only the ensemble result. Defaults to False. @@ -184,7 +194,9 @@ def run_native_planet_pipeline( reflection=reflection, keep_inputs=write_model_outputs, ) - tile = prepare_export(tile) + tile = prepare_export( + tile, binarization_threshold, mask_erosion_size, min_object_size, use_quality_mask, device + ) outpath.mkdir(parents=True, exist_ok=True) writer = InferenceResultWriter(tile) @@ -213,6 +225,10 @@ def run_native_planet_pipeline_fast( overlap: int = 16, batch_size: int = 8, reflection: int = 0, + binarization_threshold: float = 0.5, + mask_erosion_size: int = 10, + min_object_size: int = 32, + use_quality_mask: bool = False, write_model_outputs: bool = False, ): """Search for all PlanetScope scenes in the given directory and runs the segmentation pipeline on them. @@ -243,6 +259,12 @@ def run_native_planet_pipeline_fast( overlap (int, optional): The overlap to use for inference. Defaults to 16. batch_size (int, optional): The batch size to use for inference. Defaults to 8. reflection (int, optional): The reflection padding to use for inference. Defaults to 0. + binarization_threshold (float, optional): The threshold to binarize the probabilities. Defaults to 0.5. + mask_erosion_size (int, optional): The size of the disk to use for mask erosion and the edge-cropping. + Defaults to 10. + min_object_size (int, optional): The minimum object size to keep in pixel. Defaults to 32. + use_quality_mask (bool, optional): Whether to use the "quality" mask instead of the "valid" mask + to mask the output. write_model_outputs (bool, optional): Also save the model outputs, not only the ensemble result. Defaults to False. @@ -304,7 +326,9 @@ def run_native_planet_pipeline_fast( reflection=reflection, keep_inputs=write_model_outputs, ) - tile = prepare_export(tile) + tile = prepare_export( + tile, binarization_threshold, mask_erosion_size, min_object_size, use_quality_mask, device + ) outpath.mkdir(parents=True, exist_ok=True) writer = InferenceResultWriter(tile) @@ -331,6 +355,10 @@ def run_native_sentinel2_pipeline( overlap: int = 16, batch_size: int = 8, reflection: int = 0, + binarization_threshold: float = 0.5, + mask_erosion_size: int = 10, + min_object_size: int = 32, + use_quality_mask: bool = False, write_model_outputs: bool = False, ): """Search for all Sentinel scenes in the given directory and runs the segmentation pipeline on them. @@ -354,6 +382,12 @@ def run_native_sentinel2_pipeline( overlap (int, optional): The overlap to use for inference. Defaults to 16. batch_size (int, optional): The batch size to use for inference. Defaults to 8. reflection (int, optional): The reflection padding to use for inference. Defaults to 0. + binarization_threshold (float, optional): The threshold to binarize the probabilities. Defaults to 0.5. + mask_erosion_size (int, optional): The size of the disk to use for mask erosion and the edge-cropping. + Defaults to 10. + min_object_size (int, optional): The minimum object size to keep in pixel. Defaults to 32. + use_quality_mask (bool, optional): Whether to use the "quality" mask instead of the "valid" mask + to mask the output. write_model_outputs (bool, optional): Also save the model outputs, not only the ensemble result. Defaults to False. @@ -429,7 +463,9 @@ def run_native_sentinel2_pipeline( reflection=reflection, keep_inputs=write_model_outputs, ) - tile = prepare_export(tile) + tile = prepare_export( + tile, binarization_threshold, mask_erosion_size, min_object_size, use_quality_mask, device + ) outpath.mkdir(parents=True, exist_ok=True) writer = InferenceResultWriter(tile) @@ -457,6 +493,10 @@ def run_native_sentinel2_pipeline_fast( overlap: int = 16, batch_size: int = 8, reflection: int = 0, + binarization_threshold: float = 0.5, + mask_erosion_size: int = 10, + min_object_size: int = 32, + use_quality_mask: bool = False, write_model_outputs: bool = False, ): """Search for all Sentinel 2 scenes in the given directory and runs the segmentation pipeline on them. @@ -487,6 +527,12 @@ def run_native_sentinel2_pipeline_fast( overlap (int, optional): The overlap to use for inference. Defaults to 16. batch_size (int, optional): The batch size to use for inference. Defaults to 8. reflection (int, optional): The reflection padding to use for inference. Defaults to 0. + binarization_threshold (float, optional): The threshold to binarize the probabilities. Defaults to 0.5. + mask_erosion_size (int, optional): The size of the disk to use for mask erosion and the edge-cropping. + Defaults to 10. + min_object_size (int, optional): The minimum object size to keep in pixel. Defaults to 32. + use_quality_mask (bool, optional): Whether to use the "quality" mask instead of the "valid" mask + to mask the output. write_model_outputs (bool, optional): Also save the model outputs, not only the ensemble result. Defaults to False. @@ -551,7 +597,9 @@ def run_native_sentinel2_pipeline_fast( reflection=reflection, keep_inputs=write_model_outputs, ) - tile = prepare_export(tile) + tile = prepare_export( + tile, binarization_threshold, mask_erosion_size, min_object_size, use_quality_mask, device + ) outpath.mkdir(parents=True, exist_ok=True) writer = InferenceResultWriter(tile) diff --git a/notebooks/test-e2e.ipynb b/notebooks/test-e2e.ipynb index f316d6f..b305e0b 100644 --- a/notebooks/test-e2e.ipynb +++ b/notebooks/test-e2e.ipynb @@ -7,9 +7,10 @@ "outputs": [], "source": [ "import logging\n", - "from math import ceil\n", + "from math import ceil, sqrt\n", "from pathlib import Path\n", "\n", + "import folium\n", "import holoviews as hv\n", "import hvplot.xarray # noqa\n", "import xarray as xr\n", @@ -73,7 +74,17 @@ " )\n", " for z in tile.data_vars\n", " ]\n", - " return hv.Layout(var_plots).cols(ncols)" + " return hv.Layout(var_plots).cols(ncols)\n", + "\n", + "\n", + "def plot_tile_interactive(tile: xr.Dataset) -> folium.Map: # noqa\n", + " m = folium.Map()\n", + "\n", + " for z in tile.data_vars:\n", + " tile[z].odc.add_to(map=m, name=z)\n", + "\n", + " folium.LayerControl().add_to(m)\n", + " return m" ] }, { @@ -84,13 +95,14 @@ "source": [ "cache_file = DATA_ROOT / \"intermediate\" / f\"planet_{fpath.stem}.nc\"\n", "force = True\n", - "slc = {\"x\": slice(0, 1000), \"y\": slice(7000, 8000)}\n", + "slc = {\"x\": slice(0, 2000), \"y\": slice(6000, 8000)}\n", "if cache_file.exists() and not force:\n", " tile = xr.open_dataset(cache_file, engine=\"h5netcdf\", mask_and_scale=False).set_coords(\"spatial_ref\")\n", "else:\n", " tpi_outer_radius = 100\n", - " optical = load_planet_scene(fpath) # .isel(slc)\n", - " arcticdem = load_arcticdem_tile(optical.odc.geobox, arcticdem_dir, buffer=ceil(tpi_outer_radius / 2), resolution=2)\n", + " buffer = ceil(tpi_outer_radius / 2 * sqrt(2))\n", + " optical = load_planet_scene(fpath).isel(slc)\n", + " arcticdem = load_arcticdem_tile(optical.odc.geobox, arcticdem_dir, buffer=buffer, resolution=2)\n", " tcvis = load_tcvis(optical.odc.geobox, tcvis_dir)\n", " data_masks = load_planet_masks(fpath).isel(slc)\n", " tile = preprocess_legacy_fast(optical, arcticdem, tcvis, data_masks, tpi_outer_radius)\n", @@ -98,7 +110,7 @@ " tile.to_netcdf(cache_file, engine=\"h5netcdf\")\n", "\n", "display(tile)\n", - "plot_tile(tile)" + "# plot_tile(tile)" ] }, { @@ -115,7 +127,17 @@ "logging.info(ensemble.rts_v6_notcvis_model.config[\"input_combination\"])\n", "tile = ensemble(tile, batch_size=4, keep_inputs=True, patch_size=1024, overlap=256)\n", "display(tile)\n", - "plot_tile(tile)" + "# plot_tile(tile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tile = prepare_export(tile, use_quality_mask=True)\n", + "tile" ] }, { @@ -124,10 +146,17 @@ "metadata": {}, "outputs": [], "source": [ - "tile = prepare_export(tile)\n", - "display(tile)\n", "plot_tile(tile)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_tile_interactive(tile)" + ] } ], "metadata": {