Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quick fix validation reading entire zarr store for QC check #124

Merged
merged 4 commits into from
Nov 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ History
0.X.X (XXXX-XX-XX)
------------------
* Add pre-training slicing options to train-qdm and train-aiqpd. (PR #123, @brews)
* Quick fix for validation reading the entire Zarr store during the QC check. (PR #124, @brews)


0.7.0 (2021-11-02)
Expand Down
49 changes: 37 additions & 12 deletions dodola/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
"""


import numpy as np
import logging
import dask
import numpy as np
from skdownscale.spatial_models import SpatialDisaggregator
import xarray as xr
from xclim import sdba, set_options
Expand Down Expand Up @@ -509,21 +510,45 @@ def validate_dataset(ds, var, data_type, time_period="future"):
time_period : {"historical", "future"}
Time period of data that will be validated.
"""
# This is pretty rough but works to communicate the idea.
# Consider having failed tests raise something like ValidationError rather
# than AssertionErrors.

# validation for all variables
# These only read in Zarr Store metadata -- not memory intensive.
_test_variable_names(ds, var)
_test_for_nans(ds, var)
_test_timesteps(ds, data_type, time_period)

# variable specific validation
if var == "tasmin" or var == "tasmax":
_test_temp_range(ds, var)
if var == "dtr":
_test_dtr_range(ds, var)
if var == "dtr" or var == "pr":
_test_negative_values(ds, var)
if var == "pr":
_test_maximum_precip(ds, var)
    # Other tests are done on annual selections with dask.delayed to
    # avoid large memory errors. xr.map_blocks had trouble with this.
@dask.delayed
def memory_intensive_tests(ds, v, t):
d = ds.sel(time=str(t))

_test_for_nans(d, v)

if v == "tasmin":
_test_temp_range(d, v)
elif v == "tasmax":
_test_temp_range(d, v)
elif v == "dtr":
_test_dtr_range(d, v)
_test_negative_values(d, v)
elif v == "pr":
_test_negative_values(d, v)
_test_maximum_precip(d, v)
else:
raise ValueError(f"Argument {v=} not recognized")

        # Assumes an error was raised if any test above failed.
return True

results = []
for t in np.unique(ds["time"].dt.year.data):
logger.debug(f"Validating year {t}")
results.append(memory_intensive_tests(ds, var, t))
results = dask.compute(*results)
assert all(results) # Likely don't need this
return True


def _test_for_nans(ds, var):
Expand Down