From 730c57defb660c2696966be1c99491d5af16dfce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20H=C3=B6lzer?= Date: Fri, 22 Nov 2024 13:15:54 +0100 Subject: [PATCH] Hardcode chunk-size of datacubes and write docs --- .../darts_acquisition/arcticdem/datacube.py | 7 ++---- .../src/darts_acquisition/tcvis.py | 23 ++++++++++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/darts-acquisition/src/darts_acquisition/arcticdem/datacube.py b/darts-acquisition/src/darts_acquisition/arcticdem/datacube.py index 064e267..7674f70 100644 --- a/darts-acquisition/src/darts_acquisition/arcticdem/datacube.py +++ b/darts-acquisition/src/darts_acquisition/arcticdem/datacube.py @@ -21,6 +21,7 @@ RESOLUTIONS = Literal[2, 10, 32] +CHUNK_SIZE = 3600 # https://www.pgc.umn.edu/guides/stereo-derived-elevation-models/pgc-dem-products-arcticdem-rema-and-earthdem DATA_EXTENT = { 2: GeoBox.from_bbox((-3314693.24, -3314693.24, 3314693.24, 3314693.24), "epsg:3413", resolution=2), @@ -250,7 +251,6 @@ def load_arcticdem_tile( geobox: GeoBox, data_dir: Path, resolution: RESOLUTIONS, - chunk_size: int = 6000, buffer: int = 0, persist: bool = True, ) -> xr.Dataset: @@ -260,8 +260,6 @@ def load_arcticdem_tile( geobox (GeoBox): The geobox for which the tile should be loaded. data_dir (Path): The directory where the ArcticDEM data is stored. resolution (Literal[2, 10, 32]): The resolution of the ArcticDEM data in m. - chunk_size (int, optional): The chunk size for the datacube. Only relevant for the initial creation. - Has no effect otherwise. Defaults to 6000. buffer (int, optional): The buffer around the geobox in pixels. Defaults to 0. persist (bool, optional): If the data should be persisted in memory. If not, this will return a Dask backed Dataset. Defaults to True. @@ -276,7 +274,6 @@ def load_arcticdem_tile( 2. Geobox must be in a meter based CRS. """ - # TODO: What is a good chunk size? # TODO: Thread-safety concers: # - How can we ensure that the same arcticdem tile is not downloaded twice at the same time? # - How can we ensure that the extent is not downloaded twice at the same time? @@ -297,7 +294,7 @@ def load_arcticdem_tile( "ArcticDEM Data Cube", storage, DATA_EXTENT[resolution], - chunk_size, + CHUNK_SIZE, DATA_VARS, DATA_VARS_META, DATA_VARS_ENCODING, diff --git a/darts-acquisition/src/darts_acquisition/tcvis.py b/darts-acquisition/src/darts_acquisition/tcvis.py index 6eec0a1..e7546e3 100644 --- a/darts-acquisition/src/darts_acquisition/tcvis.py +++ b/darts-acquisition/src/darts_acquisition/tcvis.py @@ -19,6 +19,7 @@ EE_WARN_MSG = "Unable to retrieve 'system:time_start' values from an ImageCollection due to: No 'system:time_start' values found in the 'ImageCollection'." # noqa: E501 +CHUNK_SIZE = 3600 DATA_EXTENT = GeoBox.from_bbox((-180, 60, 180, 90), "epsg:4326", resolution=0.00026949) DATA_VARS = ["tc_brightness", "tc_greenness", "tc_wetness"] DATA_VARS_META = { @@ -44,6 +45,19 @@ def procedural_download_datacube(storage: zarr.storage.Store, geobox: GeoBox): + """Download the TCVIS data procedurally and add it to the datacube. + + Args: + storage (zarr.storage.Store): The zarr storage object where the datacube will be saved. + geobox (GeoBox): The geobox to download the data for. + + References: + - https://earthmover.io/blog/serverless-datacube-pipeline + + Warning: + This function is not thread-safe. Thread-safety might be added in the future. + + """ tick_fstart = time.perf_counter() # Check if data already exists @@ -145,7 +159,6 @@ def procedural_download_datacube(storage: zarr.storage.Store, geobox: GeoBox): def load_tcvis( geobox: GeoBox, data_dir: Path, - chunk_size: int = 6000, buffer: int = 0, persist: bool = True, ) -> xr.Dataset: @@ -153,6 +166,10 @@ def load_tcvis( Args: geobox (GeoBox): The geobox to load the data for. + data_dir (Path): The directory to store the downloaded data for faster access for consecutive calls. + buffer (int, optional): The buffer around the geobox in pixels. Defaults to 0. + persist (bool, optional): If the data should be persisted in memory. + If not, this will return a Dask backed Dataset. Defaults to True. Returns: xr.Dataset: The TCVIS dataset. @@ -170,14 +187,14 @@ def load_tcvis( title="Landsat Trends TCVIS 2000-2019", storage=storage, geobox=DATA_EXTENT, - chunk_size=chunk_size, + chunk_size=CHUNK_SIZE, data_vars=DATA_VARS, meta=DATA_VARS_META, var_encoding=DATA_VARS_ENCODING, ) # Download the adjacent tiles (if necessary) - reference_geobox = geobox.to_crs("epsg:4326").pad(buffer) + reference_geobox = geobox.to_crs("epsg:4326", resolution=DATA_EXTENT.resolution.x).pad(buffer) procedural_download_datacube(storage, reference_geobox) # Load the datacube and set the spatial_ref since it is set as a coordinate within the zarr format