Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quick fix validation reading entire zarr store for QC check #124

Merged
merged 4 commits into from
Nov 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ History
0.X.X (XXXX-XX-XX)
------------------
* Add pre-training slicing options to train-qdm and train-aiqpd. (PR #123, @brews)
* Quick fix for validation reading the entire Zarr store during the QC check. (PR #124, @brews)


0.7.0 (2021-11-02)
Expand Down
49 changes: 37 additions & 12 deletions dodola/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
"""


import numpy as np
import logging
import dask
import numpy as np
from skdownscale.spatial_models import SpatialDisaggregator
import xarray as xr
from xclim import sdba, set_options
Expand Down Expand Up @@ -509,21 +510,45 @@ def validate_dataset(ds, var, data_type, time_period="future"):
time_period : {"historical", "future"}
Time period of data that will be validated.
"""
# This is pretty rough but works to communicate the idea.
# Consider having failed tests raise something like ValidationError rather
# than AssertionErrors.

# validation for all variables
# These only read in Zarr Store metadata -- not memory intensive.
_test_variable_names(ds, var)
_test_for_nans(ds, var)
_test_timesteps(ds, data_type, time_period)

# variable specific validation
if var == "tasmin" or var == "tasmax":
_test_temp_range(ds, var)
if var == "dtr":
_test_dtr_range(ds, var)
if var == "dtr" or var == "pr":
_test_negative_values(ds, var)
if var == "pr":
_test_maximum_precip(ds, var)
    # Other tests are done on annual selections with dask.delayed to
    # avoid large memory errors. xr.map_blocks had trouble with this.
@dask.delayed
def memory_intensive_tests(ds, v, t):
d = ds.sel(time=str(t))

_test_for_nans(d, v)

if v == "tasmin":
_test_temp_range(d, v)
elif v == "tasmax":
_test_temp_range(d, v)
elif v == "dtr":
_test_dtr_range(d, v)
_test_negative_values(d, v)
elif v == "pr":
_test_negative_values(d, v)
_test_maximum_precip(d, v)
else:
raise ValueError(f"Argument {v=} not recognized")

        # Assumes an error was raised if any test above failed.
return True

results = []
for t in np.unique(ds["time"].dt.year.data):
logger.debug(f"Validating year {t}")
results.append(memory_intensive_tests(ds, var, t))
results = dask.compute(*results)
assert all(results) # Likely don't need this
return True


def _test_for_nans(ds, var):
Expand Down