Fix nan values in nominal x/y coordinates #79

Merged: 6 commits, Jan 29, 2021
33 changes: 26 additions & 7 deletions cmip6_preprocessing/preprocessing.py
@@ -107,6 +107,12 @@ def broadcast_lonlat(ds, verbose=True):
return ds


def _interp_nominal_lon(lon_1d):
x = np.arange(len(lon_1d))
idx = np.isnan(lon_1d)
return np.interp(x, x[~idx], lon_1d[~idx], period=360)
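A minimal standalone sketch of what this helper does, using a hypothetical input array: interior NaNs are filled by linear interpolation over the grid index, and `period=360` treats that index as cyclic, so edge NaNs are filled by wrapping around to the opposite end of the array instead of being held constant at the nearest valid value.

import numpy as np

# hypothetical 1D nominal longitudes with NaNs at the edges and in the interior
lon_1d = np.array([np.nan, 10.0, 20.0, np.nan, 40.0, np.nan])
x = np.arange(len(lon_1d))
idx = np.isnan(lon_1d)
filled = np.interp(x, x[~idx], lon_1d[~idx], period=360)
assert not np.isnan(filled).any()  # every gap is filled, including the edge values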


def replace_x_y_nominal_lat_lon(ds):
"""Approximate the dimensional values of x and y with mean lat and lon at the equator"""
ds = ds.copy()
@@ -135,14 +141,27 @@ def maybe_fix_non_unique(data, pad=False):
return data

if "x" in ds.dims and "y" in ds.dims:
# define 'nominal' longitude/latitude values
# latitude is defined as the max value of `lat` in the zonal direction
# longitude is taken from the `middle` of the meridional direction, to
# get values close to the equator

# pick the nominal lon/lat values from the eastern
# and southern edge, and eliminate non unique values
# these occour e.g. in "MPI-ESM1-2-HR"
max_lat_idx = ds.lat.isel(y=-1).argmax("x").load().data
nominal_y = maybe_fix_non_unique(ds.isel(x=max_lat_idx).lat.load().data)
# and southern edge, and
eq_idx = len(ds.y) // 2
nominal_x = maybe_fix_non_unique(ds.isel(y=eq_idx).lon.load().data)

nominal_x = ds.isel(y=eq_idx).lon.load()
nominal_y = ds.lat.max("x").load()

# interpolate nans
# Special treatment for gaps in longitude
nominal_x = _interp_nominal_lon(nominal_x.data)
nominal_y = nominal_y.interpolate_na("y").data

# eliminate non-unique values
# these occur e.g. in "MPI-ESM1-2-HR"
nominal_y = maybe_fix_non_unique(nominal_y)
nominal_x = maybe_fix_non_unique(nominal_x)

ds = ds.assign_coords(x=nominal_x, y=nominal_y)
ds = ds.sortby("x")
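A hedged usage sketch of the refactored function on a toy curvilinear grid (names and values here are made up, mirroring the test added below): a dataset whose 2D lon coordinate has a NaN hole should now come back with NaN-free, strictly increasing 1D dimension coordinates.

import numpy as np
import xarray as xr
from cmip6_preprocessing.preprocessing import replace_x_y_nominal_lat_lon

# hypothetical curvilinear grid built the same way as in the tests
x = np.arange(20)
y = np.arange(10)
lon = xr.DataArray(np.linspace(0, 360, len(x)), coords=[("x", x)])
lat = xr.DataArray(np.linspace(-90, 90, len(y)), coords=[("y", y)])
data = xr.DataArray(np.random.rand(len(x), len(y)), coords=[("x", x), ("y", y)])
ds = data.to_dataset(name="thetao")
ds.coords["lon"] = lon * xr.ones_like(lat)  # broadcast to 2D (x, y)
ds.coords["lat"] = xr.ones_like(lon) * lat

# punch a NaN hole into the 2D lon, as seen in some models
lon2d = ds["lon"].load().data.copy()
lon2d[5:8, 4:6] = np.nan
ds["lon"].data = lon2d

ds_fixed = replace_x_y_nominal_lat_lon(ds)
assert not np.isnan(ds_fixed.x).any() and not np.isnan(ds_fixed.y).any()
assert (ds_fixed.x.diff("x") > 0).all() and (ds_fixed.y.diff("y") > 0).all()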
@@ -230,8 +249,8 @@ def correct_lon(ds):

# remove out of bounds values found in some
# models as missing values
ds["lon"] = ds["lon"].where(abs(ds["lon"]) <= 1e35)
ds["lat"] = ds["lat"].where(abs(ds["lat"]) <= 1e35)
ds["lon"] = ds["lon"].where(abs(ds["lon"]) <= 1000)
ds["lat"] = ds["lat"].where(abs(ds["lat"]) <= 1000)

# adjust lon convention
lon = ds["lon"].where(ds["lon"] > 0, 360 + ds["lon"])
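The tightened bound above is easier to see on a toy array (values made up): anything with an absolute value above 1000 is masked to NaN before the longitude convention is applied, which now also catches moderate out-of-range placeholders like the 1001 added to the test parameters below, not just 1e36-style missing values.

import numpy as np
import xarray as xr

# hypothetical lon values: two valid entries, a 1e36-style fill value, and an out-of-range 1001
lon = xr.DataArray([-75.0, 120.0, 1e36, 1001.0])
masked = lon.where(abs(lon) <= 1000)              # 1e36 and 1001 -> NaN
wrapped = masked.where(masked > 0, 360 + masked)  # -75 -> 285, everything in 0..360
print(wrapped.values)                             # [285. 120. nan nan]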
30 changes: 26 additions & 4 deletions cmip6_preprocessing/tests/test_preprocessing.py
@@ -90,8 +90,9 @@ def test_promote_empty_dims():
assert set(["x", "y", "z"]).issubset(set(ds_promoted.coords))


@pytest.mark.parametrize("nans", [True, False])
@pytest.mark.parametrize("dask", [True, False])
def test_replace_x_y_nominal_lat_lon(dask):
def test_replace_x_y_nominal_lat_lon(dask, nans):
x = np.linspace(0, 720, 10)
y = np.linspace(-200, 140, 5)
lon = xr.DataArray(np.linspace(0, 360, len(x)), coords=[("x", x)])
@@ -104,21 +105,39 @@ def test_replace_x_y_nominal_lat_lon(dask):
ds.coords["lon"] = llon
ds.coords["lat"] = llat

if nans:
lon = ds["lon"].load().data
lon[0, :] = np.nan
lon[-1, :] = np.nan
lon[:, 0] = np.nan
lon[:, -1] = np.nan
lon[15:23, 23:26] = np.nan
ds["lon"].data = lon

# for lats put only some nans in the middle.
# I currently have no way to interpolate lats at the edge.
lat = ds["lat"].load().data
lat[15:23, 23:26] = np.nan
ds["lat"].data = lat

if dask:
ds = ds.chunk({"x": -1, "y": -1})
ds.coords["lon"] = ds.coords["lon"].chunk({"x": -1, "y": -1})
ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1})

replaced_ds = replace_x_y_nominal_lat_lon(ds)

np.testing.assert_allclose(replaced_ds.x, lon)
np.testing.assert_allclose(replaced_ds.y, lat)
assert all(~np.isnan(replaced_ds.x))
assert all(~np.isnan(replaced_ds.y))

assert all(replaced_ds.x.diff("x") > 0)
assert all(replaced_ds.y.diff("y") > 0)
assert len(replaced_ds.lon.shape) == 2
assert len(replaced_ds.lat.shape) == 2
assert set(replaced_ds.lon.dims) == set(["x", "y"])
assert set(replaced_ds.lat.dims) == set(["x", "y"])
assert all(~np.isnan(replaced_ds.x))
assert all(~np.isnan(replaced_ds.y))

# test a dataset that would result in duplicates with current method
x = np.linspace(0, 720, 4)
@@ -142,6 +161,8 @@ def test_replace_x_y_nominal_lat_lon(dask):
ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1})

replaced_ds = replace_x_y_nominal_lat_lon(ds)
assert all(~np.isnan(replaced_ds.x))
assert all(~np.isnan(replaced_ds.y))
assert len(replaced_ds.y) == len(np.unique(replaced_ds.y))
assert len(replaced_ds.x) == len(np.unique(replaced_ds.x))
# make sure values are sorted in ascending order
@@ -214,7 +235,7 @@ def test_parse_lon_lat_bounds():
assert "time" not in ds_test2.variables


@pytest.mark.parametrize("missing_values", [False, 1e36, -1e36])
@pytest.mark.parametrize("missing_values", [False, 1e36, -1e36, 1001, -1001])
@pytest.mark.parametrize(
"shift",
[
@@ -234,6 +255,7 @@ def test_correct_lon(missing_values, shift):
lon = ds["lon"].load().data
lon[10:20, 10:20] = missing_values
ds["lon"].data = lon

ds_lon_corrected = correct_lon(ds)
assert ds_lon_corrected.lon.min() >= 0
assert ds_lon_corrected.lon.max() <= 360
13 changes: 4 additions & 9 deletions cmip6_preprocessing/tests/test_preprocessing_cloud.py
@@ -13,7 +13,7 @@

pytest.importorskip("gcsfs")

# test_models = ["CESM2-FV", "GFDL-ESM4", "GFDL-CM4", "CanESM5"]
# test_models = ["CESM2-FV2", "GFDL-CM4"]
test_models = all_models()


@@ -47,15 +47,13 @@ def spec_check_dim_coord_values_wo_intake(request, gl, vi, ei):
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
# TODO: would be nice to have a "*" matching...
("CESM2-FV2", "thetao", "historical", "gn"),
# (
# "GFDL-CM4",
# "thetao",
# "historical",
# "gn",
# ), # this should not fail and should trigger an xpass (I just use this for dev purposes to check
# # the strict option)
("CESM2-FV2", "thetao", "ssp585", "gn"),
]
spec = (request.param, vi, ei, gl)
request.param = spec
@@ -96,7 +94,7 @@ def test_check_dim_coord_values_wo_intake(
diagnose_doubles(ds[di].load().data)
assert len(ds[di]) == len(np.unique(ds[di]))
if di != "time": # these tests do not make sense for decoded time
assert ~np.all(np.isnan(ds[di]))
assert np.all(~np.isnan(ds[di]))
assert np.all(ds[di].diff(di) >= 0)

assert ds.lon.min().load() >= 0
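The swapped assertion above is a real strengthening rather than a stylistic change; a quick illustration with a hypothetical array:

import numpy as np

a = np.array([0.0, 1.0, np.nan])
bool(~np.all(np.isnan(a)))   # True  -> the old assert passed as long as not *all* values were NaN
bool(np.all(~np.isnan(a)))   # False -> the new assert fails if *any* value is NaN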
@@ -108,6 +106,7 @@
assert ds.lat.max().load() <= 90
# make sure lon and lat are 2d
assert len(ds.lon.shape) == 2
assert len(ds.lat.shape) == 2


# this fixture has to be redefined every time to account for different fail cases for each test
@@ -119,8 +118,6 @@ def spec_check_dim_coord_values(request, gl, vi, ei):
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
# TODO: would be nice to have a "*" matching...
("CESM2-FV2", "thetao", "historical", "gn"),
("CESM2-FV2", "thetao", "ssp585", "gn"),
(
"IPSL-CM6A-LR",
"thetao",
@@ -167,7 +164,7 @@ def test_check_dim_coord_values(
diagnose_doubles(ds[di].load().data)
assert len(ds[di]) == len(np.unique(ds[di]))
if di != "time": # these tests do not make sense for decoded time
assert ~np.all(np.isnan(ds[di]))
assert np.all(~np.isnan(ds[di]))
assert np.all(ds[di].diff(di) >= 0)

assert ds.lon.min().load() >= 0
@@ -194,7 +191,6 @@ def spec_check_bounds_verticies(request, gl, vi, ei):
("AWI-ESM-1-1-MR", "thetao", "ssp585", "gn"),
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
("CESM2-FV2", "thetao", "historical", "gn"),
("FGOALS-f3-L", "thetao", "historical", "gn"),
("FGOALS-f3-L", "thetao", "ssp585", "gn"),
("FGOALS-g3", "thetao", "historical", "gn"),
@@ -284,7 +280,6 @@ def spec_check_grid(request, gl, vi, ei):
("AWI-ESM-1-1-MR", "thetao", "ssp585", "gn"),
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
("CESM2-FV2", "thetao", "historical", "gn"),
("CMCC-CM2-SR5", "thetao", "historical", "gn"),
("CMCC-CM2-SR5", "thetao", "ssp585", "gn"),
("FGOALS-f3-L", "thetao", "historical", "gn"),
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -10,6 +10,9 @@ v0.2.0 (unreleased)

Breaking changes
~~~~~~~~~~~~~~~~
- Further refactor of `replace_x_y_nominal_lat_lon`, which avoids missing values in the dimension coordinates (:issue:`66`) (:pull:`79`)
By `Julius Busecke <https://github.com/jbusecke>`_

- Consistent treatment of cf-style bounds. The combination of `parse_lon_lat_bounds`,
`maybe_convert_bounds_to_vertex`, `maybe_convert_vertex_to_bounds`, and `sort_vertex_order` applied to the dataset ensures
that all datasets have both conventions available and the same vertex order.