From b8d6daffba4cd99b75ad0e4ae1a116bece754c2b Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 9 May 2021 23:02:53 +0100 Subject: [PATCH 1/9] initial changes --- xarray/core/computation.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index e12938d6965..3af5ebacf2c 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1327,7 +1327,23 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() - if not valid_values.all(): + def _nan_check(d): + if d.all(): + return True + else: + return False + + if is_duck_dask_array(valid_values.data): + # assign to copy - else the check is not triggered + _are_there_nans = valid_values.copy( + data=valid_values.data.map_blocks(_nan_check, dtype=valid_values.dtype), + deep=False, + ) + + else: + _are_there_nans = _nan_check(valid_values.data) + + if not _are_there_nans: da_a = da_a.where(valid_values) da_b = da_b.where(valid_values) From 227b6225ceb6992b6ba6e3b9eeb0458302723e7f Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Sun, 16 May 2021 18:27:42 +0100 Subject: [PATCH 2/9] Using map_blocks to lazily mask input arrays, following #4559 --- xarray/core/computation.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 3af5ebacf2c..e92d332712e 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1326,28 +1326,23 @@ def _cov_corr(da_a, da_b, dim=None, ddof=0, method=None): # 2. Ignore the nans valid_values = da_a.notnull() & da_b.notnull() + valid_count = valid_values.sum(dim) - ddof - def _nan_check(d): - if d.all(): - return True + def _get_valid_values(da, other): + """ + Function to lazily mask da_a and da_b + following a similar approach to + https://github.com/pydata/xarray/pull/4559 + """ + missing_vals = np.logical_or(da.isnull(), other.isnull()) + if missing_vals.any(): + da = da.where(~missing_vals) + return da else: - return False - - if is_duck_dask_array(valid_values.data): - # assign to copy - else the check is not triggered - _are_there_nans = valid_values.copy( - data=valid_values.data.map_blocks(_nan_check, dtype=valid_values.dtype), - deep=False, - ) - - else: - _are_there_nans = _nan_check(valid_values.data) + return da - if not _are_there_nans: - da_a = da_a.where(valid_values) - da_b = da_b.where(valid_values) - - valid_count = valid_values.sum(dim) - ddof + da_a = da_a.map_blocks(_get_valid_values, args=[da_b]) + da_b = da_b.map_blocks(_get_valid_values, args=[da_a]) # 3. Detrend along the given dim demeaned_da_a = da_a - da_a.mean(dim=dim) From 6619a6c33ff772d826b3433f8ab527a5c351692c Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Wed, 26 May 2021 10:17:34 +0100 Subject: [PATCH 3/9] Adding lazy corr cov test with `raise_if_dask_computes` --- xarray/tests/test_computation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index c76633de831..66367cca796 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -22,7 +22,7 @@ unified_dim_sizes, ) -from . import has_dask, requires_dask +from . import has_dask, raise_if_dask_computes, requires_dask dask = pytest.importorskip("dask") @@ -1004,6 +1004,24 @@ def arrays_w_tuples(): return arrays, array_tuples +@pytest.mark.parametrize("ddof", [0, 1]) +@pytest.mark.parametrize( + "da_a, da_b", + [arrays_w_tuples()[1][3], arrays_w_tuples()[1][4], arrays_w_tuples()[1][5]], +) +@pytest.mark.parametrize("dim", [None, "x", "time"]) +def test_lazy_corrcov(da_a, da_b, dim, ddof): + # GH 5284 + from dask import is_dask_collection + + with raise_if_dask_computes(): + cov = xr.cov(da_a.chunk(), da_b.chunk(), dim=dim, ddof=ddof) + assert is_dask_collection(cov) + + corr = xr.corr(da_a.chunk(), da_b.chunk(), dim=dim) + assert is_dask_collection(corr) + + @pytest.mark.parametrize("ddof", [0, 1]) @pytest.mark.parametrize( "da_a, da_b", From 637ab8060223a5b186fd78dcdf683bc0c8a18b66 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Wed, 26 May 2021 19:05:12 +0100 Subject: [PATCH 4/9] adding test for one da without nans --- xarray/tests/test_computation.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 66367cca796..ae2ee233362 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -990,6 +990,7 @@ def arrays_w_tuples(): da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(), xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]), xr.DataArray([[1, 2], [np.nan, np.nan]], dims=["x", "time"]), + xr.DataArray([[1, 2], [2, 1]], dims=["x", "time"]), ] array_tuples = [ @@ -998,6 +999,7 @@ def arrays_w_tuples(): (arrays[1], arrays[1]), (arrays[2], arrays[2]), (arrays[2], arrays[3]), + (arrays[2], arrays[4]), (arrays[3], arrays[3]), ] @@ -1007,7 +1009,12 @@ def arrays_w_tuples(): @pytest.mark.parametrize("ddof", [0, 1]) @pytest.mark.parametrize( "da_a, da_b", - [arrays_w_tuples()[1][3], arrays_w_tuples()[1][4], arrays_w_tuples()[1][5]], + [ + arrays_w_tuples()[1][3], + arrays_w_tuples()[1][4], + arrays_w_tuples()[1][5], + arrays_w_tuples()[1][6], + ], ) @pytest.mark.parametrize("dim", [None, "x", "time"]) def test_lazy_corrcov(da_a, da_b, dim, ddof): From fff455644ecdd12e2adb9a8b4612ad511ee71b25 Mon Sep 17 00:00:00 2001 From: Andrew Williams <56925856+AndrewWilliams3142@users.noreply.github.com> Date: Wed, 26 May 2021 19:38:10 +0100 Subject: [PATCH 5/9] checking ordering of arrays doesnt matter Co-authored-by: Deepak Cherian --- xarray/tests/test_computation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index ae2ee233362..97abe7fe7c6 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1000,6 +1000,7 @@ def arrays_w_tuples(): (arrays[2], arrays[2]), (arrays[2], arrays[3]), (arrays[2], arrays[4]), + (arrays[4], arrays[2]), (arrays[3], arrays[3]), ] From 2756488fe0081d6b240d84bff8a028f3a7dd8c6c Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Wed, 26 May 2021 19:38:51 +0100 Subject: [PATCH 6/9] adjust inputs to test --- xarray/tests/test_computation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 97abe7fe7c6..f226523ae5f 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1015,6 +1015,7 @@ def arrays_w_tuples(): arrays_w_tuples()[1][4], arrays_w_tuples()[1][5], arrays_w_tuples()[1][6], + arrays_w_tuples()[1][7], ], ) @pytest.mark.parametrize("dim", [None, "x", "time"]) From 58c441d434a4a7e9125389aa4ddb385fb895fab1 Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Wed, 26 May 2021 22:06:39 +0100 Subject: [PATCH 7/9] add test for array with no missing values --- xarray/tests/test_computation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index f226523ae5f..0d41a22b9e4 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1002,6 +1002,7 @@ def arrays_w_tuples(): (arrays[2], arrays[4]), (arrays[4], arrays[2]), (arrays[3], arrays[3]), + (arrays[4], arrays[4]), ] return arrays, array_tuples @@ -1016,6 +1017,7 @@ def arrays_w_tuples(): arrays_w_tuples()[1][5], arrays_w_tuples()[1][6], arrays_w_tuples()[1][7], + arrays_w_tuples()[1][8], ], ) @pytest.mark.parametrize("dim", [None, "x", "time"]) From a9bd72343faa20eb2bbe0a0e1f6893d53045680c Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Thu, 27 May 2021 09:51:32 +0100 Subject: [PATCH 8/9] added whatsnew --- doc/whats-new.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 26c975c859e..184d55302c2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,9 @@ v0.18.1 (unreleased) New Features ~~~~~~~~~~~~ - +- :py:func:`xarray.cov` and :py:func:`xarray.corr` now lazily check for missing + values if inputs are dask arrays (:issue:`4804`, :pull:`5284`). + By `Andrew Williams `_. Breaking changes ~~~~~~~~~~~~~~~~ From ccb5f67b5c8c35e5439a4e7ce1ad69d9400d29da Mon Sep 17 00:00:00 2001 From: Andrew Williams Date: Thu, 27 May 2021 10:05:13 +0100 Subject: [PATCH 9/9] fixing format issues --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 52ea378b8c8..b957ed4bab2 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,7 +28,7 @@ New Features - :py:func:`xarray.cov` and :py:func:`xarray.corr` now lazily check for missing values if inputs are dask arrays (:issue:`4804`, :pull:`5284`). By `Andrew Williams `_. - + Breaking changes ~~~~~~~~~~~~~~~~