Fix nan values in nominal x/y coordinates #79

Merged: 6 commits, Jan 29, 2021
33 changes: 26 additions & 7 deletions cmip6_preprocessing/preprocessing.py
@@ -107,6 +107,12 @@ def broadcast_lonlat(ds, verbose=True):
return ds


def _interp_nominal_lon(lon_1d):
x = np.arange(len(lon_1d))
idx = np.isnan(lon_1d)
return np.interp(x, x[~idx], lon_1d[~idx], period=360)
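A minimal standalone sketch of what this helper does, using a hypothetical input array: interior NaNs are filled by linear interpolation over the grid index, and `period=360` treats that index as cyclic, so edge NaNs are filled by wrapping around to the opposite end of the array instead of being held constant at the nearest valid value.

import numpy as np

# hypothetical 1D nominal longitudes with NaNs at the edges and in the interior
lon_1d = np.array([np.nan, 10.0, 20.0, np.nan, 40.0, np.nan])
x = np.arange(len(lon_1d))
idx = np.isnan(lon_1d)
filled = np.interp(x, x[~idx], lon_1d[~idx], period=360)
assert not np.isnan(filled).any()  # every gap is filled, including the edge values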


def replace_x_y_nominal_lat_lon(ds):
"""Approximate the dimensional values of x and y with mean lat and lon at the equator"""
ds = ds.copy()
@@ -135,14 +141,27 @@ def maybe_fix_non_unique(data, pad=False):
return data

if "x" in ds.dims and "y" in ds.dims:
# define 'nominal' longitude/latitude values
# latitude is defined as the max value of `lat` in the zonal direction
# longitude is taken from the `middle` of the meridional direction, to
# get values close to the equator

# pick the nominal lon/lat values from the eastern
# and southern edge, and eliminate non unique values
# these occour e.g. in "MPI-ESM1-2-HR"
max_lat_idx = ds.lat.isel(y=-1).argmax("x").load().data
nominal_y = maybe_fix_non_unique(ds.isel(x=max_lat_idx).lat.load().data)
# and southern edge, and
eq_idx = len(ds.y) // 2
nominal_x = maybe_fix_non_unique(ds.isel(y=eq_idx).lon.load().data)

nominal_x = ds.isel(y=eq_idx).lon.load()
nominal_y = ds.lat.max("x").load()

# interpolate nans
# Special treatment for gaps in longitude
nominal_x = _interp_nominal_lon(nominal_x.data)
nominal_y = nominal_y.interpolate_na("y").data

# eliminate non-unique values
# these occur e.g. in "MPI-ESM1-2-HR"
nominal_y = maybe_fix_non_unique(nominal_y)
nominal_x = maybe_fix_non_unique(nominal_x)

ds = ds.assign_coords(x=nominal_x, y=nominal_y)
ds = ds.sortby("x")
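A hedged usage sketch of the refactored function on a toy curvilinear grid (names and values here are made up, mirroring the test added below): a dataset whose 2D lon coordinate has a NaN hole should now come back with NaN-free, strictly increasing 1D dimension coordinates.

import numpy as np
import xarray as xr
from cmip6_preprocessing.preprocessing import replace_x_y_nominal_lat_lon

# hypothetical curvilinear grid built the same way as in the tests
x = np.arange(20)
y = np.arange(10)
lon = xr.DataArray(np.linspace(0, 360, len(x)), coords=[("x", x)])
lat = xr.DataArray(np.linspace(-90, 90, len(y)), coords=[("y", y)])
data = xr.DataArray(np.random.rand(len(x), len(y)), coords=[("x", x), ("y", y)])
ds = data.to_dataset(name="thetao")
ds.coords["lon"] = lon * xr.ones_like(lat)  # broadcast to 2D (x, y)
ds.coords["lat"] = xr.ones_like(lon) * lat

# punch a NaN hole into the 2D lon, as seen in some models
lon2d = ds["lon"].load().data.copy()
lon2d[5:8, 4:6] = np.nan
ds["lon"].data = lon2d

ds_fixed = replace_x_y_nominal_lat_lon(ds)
assert not np.isnan(ds_fixed.x).any() and not np.isnan(ds_fixed.y).any()
assert (ds_fixed.x.diff("x") > 0).all() and (ds_fixed.y.diff("y") > 0).all()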
@@ -230,8 +249,8 @@ def correct_lon(ds):

# remove out of bounds values found in some
# models as missing values
ds["lon"] = ds["lon"].where(abs(ds["lon"]) <= 1e35)
ds["lat"] = ds["lat"].where(abs(ds["lat"]) <= 1e35)
ds["lon"] = ds["lon"].where(abs(ds["lon"]) <= 1000)
ds["lat"] = ds["lat"].where(abs(ds["lat"]) <= 1000)

# adjust lon convention
lon = ds["lon"].where(ds["lon"] > 0, 360 + ds["lon"])
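The tightened bound above is easier to see on a toy array (values made up): anything with an absolute value above 1000 is masked to NaN before the longitude convention is applied, which now also catches moderate out-of-range placeholders like the 1001 added to the test parameters below, not just 1e36-style missing values.

import numpy as np
import xarray as xr

# hypothetical lon values: two valid entries, a 1e36-style fill value, and an out-of-range 1001
lon = xr.DataArray([-75.0, 120.0, 1e36, 1001.0])
masked = lon.where(abs(lon) <= 1000)              # 1e36 and 1001 -> NaN
wrapped = masked.where(masked > 0, 360 + masked)  # -75 -> 285, everything in 0..360
print(wrapped.values)                             # [285. 120. nan nan]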
30 changes: 26 additions & 4 deletions cmip6_preprocessing/tests/test_preprocessing.py
@@ -90,8 +90,9 @@ def test_promote_empty_dims():
assert set(["x", "y", "z"]).issubset(set(ds_promoted.coords))


@pytest.mark.parametrize("nans", [True, False])
@pytest.mark.parametrize("dask", [True, False])
def test_replace_x_y_nominal_lat_lon(dask):
def test_replace_x_y_nominal_lat_lon(dask, nans):
x = np.linspace(0, 720, 10)
y = np.linspace(-200, 140, 5)
lon = xr.DataArray(np.linspace(0, 360, len(x)), coords=[("x", x)])
@@ -104,21 +105,39 @@ def test_replace_x_y_nominal_lat_lon(dask):
ds.coords["lon"] = llon
ds.coords["lat"] = llat

if nans:
lon = ds["lon"].load().data
lon[0, :] = np.nan
lon[-1, :] = np.nan
lon[:, 0] = np.nan
lon[:, -1] = np.nan
lon[15:23, 23:26] = np.nan
ds["lon"].data = lon

# for lats put only some nans in the middle.
# I currently have no way to interpolate lats at the edge.
lat = ds["lat"].load().data
lat[15:23, 23:26] = np.nan
ds["lat"].data = lat

if dask:
ds = ds.chunk({"x": -1, "y": -1})
ds.coords["lon"] = ds.coords["lon"].chunk({"x": -1, "y": -1})
ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1})

replaced_ds = replace_x_y_nominal_lat_lon(ds)

np.testing.assert_allclose(replaced_ds.x, lon)
np.testing.assert_allclose(replaced_ds.y, lat)
assert all(~np.isnan(replaced_ds.x))
assert all(~np.isnan(replaced_ds.y))

assert all(replaced_ds.x.diff("x") > 0)
assert all(replaced_ds.y.diff("y") > 0)
assert len(replaced_ds.lon.shape) == 2
assert len(replaced_ds.lat.shape) == 2
assert set(replaced_ds.lon.dims) == set(["x", "y"])
assert set(replaced_ds.lat.dims) == set(["x", "y"])
assert all(~np.isnan(replaced_ds.x))
assert all(~np.isnan(replaced_ds.y))

# test a dataset that would result in duplicates with current method
x = np.linspace(0, 720, 4)
@@ -142,6 +161,8 @@ def test_replace_x_y_nominal_lat_lon(dask):
ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1})

replaced_ds = replace_x_y_nominal_lat_lon(ds)
assert all(~np.isnan(replaced_ds.x))
assert all(~np.isnan(replaced_ds.y))
assert len(replaced_ds.y) == len(np.unique(replaced_ds.y))
assert len(replaced_ds.x) == len(np.unique(replaced_ds.x))
# make sure values are sorted in ascending order
@@ -214,7 +235,7 @@ def test_parse_lon_lat_bounds():
assert "time" not in ds_test2.variables


@pytest.mark.parametrize("missing_values", [False, 1e36, -1e36])
@pytest.mark.parametrize("missing_values", [False, 1e36, -1e36, 1001, -1001])
@pytest.mark.parametrize(
"shift",
[
@@ -234,6 +255,7 @@ def test_correct_lon(missing_values, shift):
lon = ds["lon"].load().data
lon[10:20, 10:20] = missing_values
ds["lon"].data = lon

ds_lon_corrected = correct_lon(ds)
assert ds_lon_corrected.lon.min() >= 0
assert ds_lon_corrected.lon.max() <= 360
13 changes: 4 additions & 9 deletions cmip6_preprocessing/tests/test_preprocessing_cloud.py
@@ -13,7 +13,7 @@

pytest.importorskip("gcsfs")

# test_models = ["CESM2-FV", "GFDL-ESM4", "GFDL-CM4", "CanESM5"]
# test_models = ["CESM2-FV2", "GFDL-CM4"]
test_models = all_models()


@@ -47,15 +47,13 @@ def spec_check_dim_coord_values_wo_intake(request, gl, vi, ei):
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
# TODO: would be nice to have a "*" matching...
("CESM2-FV2", "thetao", "historical", "gn"),
# (
# "GFDL-CM4",
# "thetao",
# "historical",
# "gn",
# ), # this should not fail and should trigger an xpass (I just use this for dev purposes to check
# # the strict option)
("CESM2-FV2", "thetao", "ssp585", "gn"),
]
spec = (request.param, vi, ei, gl)
request.param = spec
@@ -96,7 +94,7 @@ def test_check_dim_coord_values_wo_intake(
diagnose_doubles(ds[di].load().data)
assert len(ds[di]) == len(np.unique(ds[di]))
if di != "time": # these tests do not make sense for decoded time
assert ~np.all(np.isnan(ds[di]))
assert np.all(~np.isnan(ds[di]))
assert np.all(ds[di].diff(di) >= 0)

assert ds.lon.min().load() >= 0
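The swapped assertion above is a real strengthening rather than a stylistic change; a quick illustration with a hypothetical array:

import numpy as np

a = np.array([0.0, 1.0, np.nan])
bool(~np.all(np.isnan(a)))   # True  -> the old assert passed as long as not *all* values were NaN
bool(np.all(~np.isnan(a)))   # False -> the new assert fails if *any* value is NaN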
@@ -108,6 +106,7 @@
assert ds.lat.max().load() <= 90
# make sure lon and lat are 2d
assert len(ds.lon.shape) == 2
assert len(ds.lat.shape) == 2


# this fixture has to be redefined every time to account for different fail cases for each test
@@ -119,8 +118,6 @@ def spec_check_dim_coord_values(request, gl, vi, ei):
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
# TODO: would be nice to have a "*" matching...
("CESM2-FV2", "thetao", "historical", "gn"),
("CESM2-FV2", "thetao", "ssp585", "gn"),
(
"IPSL-CM6A-LR",
"thetao",
@@ -167,7 +164,7 @@ def test_check_dim_coord_values(
diagnose_doubles(ds[di].load().data)
assert len(ds[di]) == len(np.unique(ds[di]))
if di != "time": # these tests do not make sense for decoded time
assert ~np.all(np.isnan(ds[di]))
assert np.all(~np.isnan(ds[di]))
assert np.all(ds[di].diff(di) >= 0)

assert ds.lon.min().load() >= 0
@@ -194,7 +191,6 @@ def spec_check_bounds_verticies(request, gl, vi, ei):
("AWI-ESM-1-1-MR", "thetao", "ssp585", "gn"),
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
("CESM2-FV2", "thetao", "historical", "gn"),
("FGOALS-f3-L", "thetao", "historical", "gn"),
("FGOALS-f3-L", "thetao", "ssp585", "gn"),
("FGOALS-g3", "thetao", "historical", "gn"),
@@ -284,7 +280,6 @@ def spec_check_grid(request, gl, vi, ei):
("AWI-ESM-1-1-MR", "thetao", "ssp585", "gn"),
("AWI-CM-1-1-MR", "thetao", "historical", "gn"),
("AWI-CM-1-1-MR", "thetao", "ssp585", "gn"),
("CESM2-FV2", "thetao", "historical", "gn"),
("CMCC-CM2-SR5", "thetao", "historical", "gn"),
("CMCC-CM2-SR5", "thetao", "ssp585", "gn"),
("FGOALS-f3-L", "thetao", "historical", "gn"),
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -10,6 +10,9 @@ v0.2.0 (unreleased)

Breaking changes
~~~~~~~~~~~~~~~~
- Further refactor of `replace_x_y_nominal_lat_lon`, which avoids missing values in the dimension coordinates (:issue:`66`) (:pull:`79`)
By `Julius Busecke <https://github.com/jbusecke>`_

- Consistent treatment of cf-style bounds. The combination of `parse_lon_lat_bounds`,
`maybe_convert_bounds_to_vertex`, `maybe_convert_vertex_to_bounds`, and `sort_vertex_order` applied to the dataset ensures
that all datasets have both conventions available and the same vertex order.