From 5c3554d536f792c2cabf11fc1ab211242ee8aae6 Mon Sep 17 00:00:00 2001 From: Brewster Malevich Date: Wed, 3 Nov 2021 18:01:30 -0700 Subject: [PATCH 1/4] Quick fix validation reading entire zarr store for QC check --- HISTORY.rst | 1 + dodola/core.py | 48 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 7557a5e..7b5a712 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -6,6 +6,7 @@ History 0.X.X (XXXX-XX-XX) ------------------ * Add pre-training slicing options to train-qdm and train-aiqpd. (PR #123, @brews) +* Quick fix validation reading entire zarr store for check. (@brews) 0.7.0 (2021-11-02) diff --git a/dodola/core.py b/dodola/core.py index 7aecedd..562943c 100644 --- a/dodola/core.py +++ b/dodola/core.py @@ -4,8 +4,9 @@ """ -import numpy as np import logging +import dask +import numpy as np from skdownscale.spatial_models import SpatialDisaggregator import xarray as xr from xclim import sdba, set_options @@ -509,21 +510,44 @@ def validate_dataset(ds, var, data_type, time_period="future"): time_period : {"historical", "future"} Time period of data that will be validated. """ + # This is pretty rough but works to communicate the idea. + # Should have this raise something like ValidationError rather than + # AssertionErrors. - # validation for all variables + # These only read in Zarr Store metadata -- not memory intensive. _test_variable_names(ds, var) - _test_for_nans(ds, var) _test_timesteps(ds, data_type, time_period) - # variable specific validation - if var == "tasmin" or var == "tasmax": - _test_temp_range(ds, var) - if var == "dtr": - _test_dtr_range(ds, var) - if var == "dtr" or var == "pr": - _test_negative_values(ds, var) - if var == "pr": - _test_maximum_precip(ds, var) + # Other test are done on annual selections with dask.delayed to + # avoid large memory errors. xr.map_blocks had trouble with this. + @dask.delayed + def clear_memory_intensive_tests(ds, v, t): + d = ds.sel(time=str(t)) + + _test_for_nans(d, v) + + if v == "tasmin" or v == "tasmax": + _test_temp_range(d, v) + if v == "dtr": + _test_dtr_range(d, v) + if v == "dtr" or v == "pr": + _test_negative_values(d, v) + if v == "pr": + _test_maximum_precip(d, v) + else: + raise ValueError(f"Argument {v=} not recognized") + + # Assumes error thrown if had problem before this. + return True + + tasks = [] + for t in np.unique(ds["time"].dt.year.data): + logger.debug(f"Validating for year {t}") + test_results = clear_memory_intensive_tests(ds, var, t) + tasks.append(test_results) + tasks = dask.compute(*tasks) + assert all(tasks) # Likely don't need this + return True def _test_for_nans(ds, var): From e2451581cca0b19e49a5153dce8ffcf11f6acad5 Mon Sep 17 00:00:00 2001 From: Brewster Malevich Date: Wed, 3 Nov 2021 18:03:11 -0700 Subject: [PATCH 2/4] Add PR number to HISTORY --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 7b5a712..31c7ed1 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -6,7 +6,7 @@ History 0.X.X (XXXX-XX-XX) ------------------ * Add pre-training slicing options to train-qdm and train-aiqpd. (PR #123, @brews) -* Quick fix validation reading entire zarr store for check. (@brews) +* Quick fix validation reading entire zarr store for check. (PR #124, @brews) 0.7.0 (2021-11-02) From 6658bf0ca1699974f7f31849b0bcc54f27538256 Mon Sep 17 00:00:00 2001 From: Brewster Malevich Date: Wed, 3 Nov 2021 18:23:01 -0700 Subject: [PATCH 3/4] Clean up sloppy variable check, make per-variable check cleaner --- dodola/core.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dodola/core.py b/dodola/core.py index 562943c..e549e15 100644 --- a/dodola/core.py +++ b/dodola/core.py @@ -511,8 +511,8 @@ def validate_dataset(ds, var, data_type, time_period="future"): Time period of data that will be validated. """ # This is pretty rough but works to communicate the idea. - # Should have this raise something like ValidationError rather than - # AssertionErrors. + # Consider having failed tests raise something like ValidationError rather + # than AssertionErrors. # These only read in Zarr Store metadata -- not memory intensive. _test_variable_names(ds, var) @@ -521,18 +521,20 @@ def validate_dataset(ds, var, data_type, time_period="future"): # Other test are done on annual selections with dask.delayed to # avoid large memory errors. xr.map_blocks had trouble with this. @dask.delayed - def clear_memory_intensive_tests(ds, v, t): + def memory_intensive_tests(ds, v, t): d = ds.sel(time=str(t)) _test_for_nans(d, v) - if v == "tasmin" or v == "tasmax": + if v == "tasmin": _test_temp_range(d, v) - if v == "dtr": + elif v == "tasmax": + _test_temp_range(d, v) + elif v == "dtr": _test_dtr_range(d, v) - if v == "dtr" or v == "pr": _test_negative_values(d, v) - if v == "pr": + elif v == "pr": + _test_negative_values(d, v) _test_maximum_precip(d, v) else: raise ValueError(f"Argument {v=} not recognized") @@ -543,7 +545,7 @@ def clear_memory_intensive_tests(ds, v, t): tasks = [] for t in np.unique(ds["time"].dt.year.data): logger.debug(f"Validating for year {t}") - test_results = clear_memory_intensive_tests(ds, var, t) + test_results = memory_intensive_tests(ds, var, t) tasks.append(test_results) tasks = dask.compute(*tasks) assert all(tasks) # Likely don't need this From 4824c595a3595ce70397992131c1106c7bc93e29 Mon Sep 17 00:00:00 2001 From: Brewster Malevich Date: Wed, 3 Nov 2021 18:30:16 -0700 Subject: [PATCH 4/4] Make variables pretty, minor cleanup --- dodola/core.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dodola/core.py b/dodola/core.py index e549e15..f641448 100644 --- a/dodola/core.py +++ b/dodola/core.py @@ -542,13 +542,12 @@ def memory_intensive_tests(ds, v, t): # Assumes error thrown if had problem before this. return True - tasks = [] + results = [] for t in np.unique(ds["time"].dt.year.data): - logger.debug(f"Validating for year {t}") - test_results = memory_intensive_tests(ds, var, t) - tasks.append(test_results) - tasks = dask.compute(*tasks) - assert all(tasks) # Likely don't need this + logger.debug(f"Validating year {t}") + results.append(memory_intensive_tests(ds, var, t)) + results = dask.compute(*results) + assert all(results) # Likely don't need this return True