From 13bdf2e5a090a8a6f6ad59cd1f9090ee7f400c52 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 7 Mar 2018 11:21:45 +0000 Subject: [PATCH 1/6] Add cube co-realisation. --- lib/iris/__init__.py | 3 ++- lib/iris/_lazy_data.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/iris/__init__.py b/lib/iris/__init__.py index 4a9fa43800..24c205aefb 100644 --- a/lib/iris/__init__.py +++ b/lib/iris/__init__.py @@ -112,6 +112,7 @@ def callback(cube, field, filename): from iris._deprecation import IrisDeprecation, warn_deprecated import iris.fileformats import iris.io +from iris._lazy_data import co_realise_cubes try: @@ -127,7 +128,7 @@ def callback(cube, field, filename): __all__ = ['load', 'load_cube', 'load_cubes', 'load_raw', 'save', 'Constraint', 'AttributeConstraint', 'sample_data_path', 'site_configuration', 'Future', 'FUTURE', - 'IrisDeprecation'] + 'IrisDeprecation', 'co_realise_cubes'] Constraint = iris._constraints.Constraint diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 15dac90e7f..83b4873aaf 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -1,4 +1,4 @@ -# (C) British Crown Copyright 2017, Met Office +# (C) British Crown Copyright 2017 - 2018, Met Office # # This file is part of Iris. # @@ -158,3 +158,18 @@ def multidim_lazy_stack(stack): result = da.stack([multidim_lazy_stack(subarray) for subarray in stack]) return result + + +def co_realise_cubes(cubes): + """ + Fetch real data for multiple cubes at one time. + + This fetches lazy content, equivalent to accessing each cube.data. + However, lazy calculations and data fetches can be shared between the + calculations, improving performance. + + """ + results = da.compute(list(cube.core_data() for cube in cubes)) + for cube, result in zip(cubes, results): + cube.data = result + return cubes From 499972c31a9300da6dfcf44870aa430af787269b Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 7 Mar 2018 23:34:24 +0000 Subject: [PATCH 2/6] Add tests, common code for masked constant conversion. --- lib/iris/_lazy_data.py | 34 ++++++-- .../unit/lazy_data/test_co_realise_cubes.py | 87 +++++++++++++++++++ 2 files changed, 112 insertions(+), 9 deletions(-) create mode 100644 lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 83b4873aaf..0012274327 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -102,6 +102,29 @@ def as_lazy_data(data, chunks=None, asarray=False): return data +def _co_realise_lazy_arrays(arrays): + """ + Compute multiple lazy arrays together + return a list of real values. + + Also converts any MaskedConstants to arrays, to ensure that the dtypes of + the results are the same as the inputs. + This part is only necessary because of problems with masked constants. + + """ + results = list(da.compute(*arrays)) + for i_array, (array, result) in enumerate(zip(arrays, results)): + if isinstance(result, ma.core.MaskedConstant): + # Convert any masked constants into NumPy masked arrays : In some + # cases dask may return a scalar numpy.int/numpy.float object + # rather than a numpy.ndarray object. + # Recorded in https://github.com/dask/dask/issues/2111. + result = ma.masked_array(result.data, mask=result.mask, + dtype=array.dtype) + # Replace the original result array. + results[i_array] = result + return results + + def as_concrete_data(data): """ Return the actual content of a lazy array, as a numpy array. @@ -120,14 +143,7 @@ def as_concrete_data(data): """ if is_lazy_data(data): - # Realise dask array, ensuring the data result is always a NumPy array. - # In some cases dask may return a scalar numpy.int/numpy.float object - # rather than a numpy.ndarray object. - # Recorded in https://github.com/dask/dask/issues/2111. - dtype = data.dtype - data = np.asanyarray(data.compute()) - if isinstance(data, ma.core.MaskedConstant): - data = ma.masked_array(data.data, dtype=dtype, mask=data.mask) + data, = _co_realise_lazy_arrays([data]) return data @@ -169,7 +185,7 @@ def co_realise_cubes(cubes): calculations, improving performance. """ - results = da.compute(list(cube.core_data() for cube in cubes)) + results = _co_realise_lazy_arrays([cube.core_data() for cube in cubes]) for cube, result in zip(cubes, results): cube.data = result return cubes diff --git a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py new file mode 100644 index 0000000000..0aa8c809e4 --- /dev/null +++ b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py @@ -0,0 +1,87 @@ +# (C) British Crown Copyright 2018, Met Office +# +# This file is part of Iris. +# +# Iris is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the +# Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Iris is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with Iris. If not, see . +"""Test function :func:`iris._lazy data.co_realise_cubes`.""" + +from __future__ import (absolute_import, division, print_function) +from six.moves import (filter, input, map, range, zip) # noqa + +# Import iris.tests first so that some things can be initialised before +# importing anything else. +import iris.tests as tests + +from mock import MagicMock +import numpy as np + +from iris.cube import Cube +from iris._lazy_data import as_lazy_data + +from iris._lazy_data import co_realise_cubes + + +class ArrayAccessCounter(object): + def __init__(self, array): + self.dtype = array.dtype + self.shape = array.shape + self._array = array + self.access_count = 0 + + def __getitem__(self, keys): + self.access_count += 1 + return self._array[keys] + + +class Test_co_realise_cubes(tests.IrisTest): + def test_empty(self): + self.assertEqual(co_realise_cubes([]), []) + + def test_basic(self): + real_data = np.arange(3.) + cube = Cube(as_lazy_data(real_data)) + self.assertTrue(cube.has_lazy_data()) + result, = co_realise_cubes([cube]) + self.assertEqual(result, cube) + self.assertFalse(cube.has_lazy_data()) + self.assertArrayAllClose(cube.core_data(), real_data) + + def test_multi(self): + real_data = np.arange(3.) + cube = Cube(as_lazy_data(real_data)) + self.assertTrue(cube.has_lazy_data()) + cube_2 = cube + 1 + cube_3 = cube + 2 + cubes = [cube, cube_2, cube_3] + for cube in cubes: + self.assertTrue(cube.has_lazy_data()) + results = co_realise_cubes(cubes) + self.assertEqual(results, cubes) + for cube in cubes: + self.assertFalse(cube.has_lazy_data()) + + def test_combined_access(self): + wrapped_array = ArrayAccessCounter(np.arange(3.)) + lazy_array = as_lazy_data(wrapped_array) + derived_a = lazy_array + 1 + derived_b = lazy_array + 2 + cube_a = Cube(derived_a) + cube_b = Cube(derived_b) + co_realise_cubes([cube_a, cube_b]) + # Though used twice, the source data should only get fetched once. + self.assertEqual(wrapped_array.access_count, 1) + + +if __name__ == '__main__': + tests.main() From 926d2db8fc863d31e142731aa982ea86123b667b Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 8 Mar 2018 00:07:07 +0000 Subject: [PATCH 3/6] Added whatsnew for co_realise_cubes. --- .../newfeature_2018-Mar-08_co_realise_cubes.txt | 3 +++ lib/iris/_lazy_data.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) create mode 100644 docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt diff --git a/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt b/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt new file mode 100644 index 0000000000..274ff0de5d --- /dev/null +++ b/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt @@ -0,0 +1,3 @@ +* Added new function :func:`iris.co_realise_cubes` to compute multiple lazy + values in a single operation, avoiding repeated re-loading of data or + re-calculation of expressions. diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 0012274327..6ee5a9cf45 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -104,11 +104,13 @@ def as_lazy_data(data, chunks=None, asarray=False): def _co_realise_lazy_arrays(arrays): """ - Compute multiple lazy arrays together + return a list of real values. + Compute multiple lazy arrays and return a list of real values. - Also converts any MaskedConstants to arrays, to ensure that the dtypes of - the results are the same as the inputs. - This part is only necessary because of problems with masked constants. + All the arrays are computed together, so they can share results from common + graph elements. + + Also converts any MaskedConstants returned into masked arrays, to ensure + that all return values are writeable NumPy array objects. """ results = list(da.compute(*arrays)) @@ -120,7 +122,7 @@ def _co_realise_lazy_arrays(arrays): # Recorded in https://github.com/dask/dask/issues/2111. result = ma.masked_array(result.data, mask=result.mask, dtype=array.dtype) - # Replace the original result array. + # When we change one, update the result list. results[i_array] = result return results @@ -182,7 +184,7 @@ def co_realise_cubes(cubes): This fetches lazy content, equivalent to accessing each cube.data. However, lazy calculations and data fetches can be shared between the - calculations, improving performance. + computations, improving performance. """ results = _co_realise_lazy_arrays([cube.core_data() for cube in cubes]) From 4f2b2266a074bb5cebcd8282cb2e20e0ec3c250e Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 8 Mar 2018 00:43:25 +0000 Subject: [PATCH 4/6] Fix to ensure all dm.data results are ndarrays. --- lib/iris/_lazy_data.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 6ee5a9cf45..8998af853f 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -106,24 +106,26 @@ def _co_realise_lazy_arrays(arrays): """ Compute multiple lazy arrays and return a list of real values. - All the arrays are computed together, so they can share results from common + All the arrays are computed together, so they can share results for common graph elements. Also converts any MaskedConstants returned into masked arrays, to ensure that all return values are writeable NumPy array objects. """ - results = list(da.compute(*arrays)) - for i_array, (array, result) in enumerate(zip(arrays, results)): - if isinstance(result, ma.core.MaskedConstant): + computed_arrays = da.compute(*arrays) + results = [] + for lazy_in, real_out in zip(arrays, computed_arrays): + real_out = np.asanyarray(real_out) + if isinstance(real_out, ma.core.MaskedConstant): # Convert any masked constants into NumPy masked arrays : In some # cases dask may return a scalar numpy.int/numpy.float object # rather than a numpy.ndarray object. # Recorded in https://github.com/dask/dask/issues/2111. - result = ma.masked_array(result.data, mask=result.mask, - dtype=array.dtype) - # When we change one, update the result list. - results[i_array] = result + # NOTE: also in this case, apply the original lazy-array dtype. + real_out = ma.masked_array(real_out.data, mask=real_out.mask, + dtype=lazy_in.dtype) + results.append(real_out) return results From 6345c29094bfc703ef09756d70c9faff3b35a068 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 8 Mar 2018 14:33:11 +0000 Subject: [PATCH 5/6] Review changes; comments and docstrings review; no return values. --- lib/iris/_lazy_data.py | 49 ++++++++++++++----- .../unit/lazy_data/test_co_realise_cubes.py | 31 ++++++------ 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 8998af853f..d043a5fda1 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -109,20 +109,26 @@ def _co_realise_lazy_arrays(arrays): All the arrays are computed together, so they can share results for common graph elements. - Also converts any MaskedConstants returned into masked arrays, to ensure - that all return values are writeable NumPy array objects. + Casts all results with `np.asanyarray`, and converts any MaskedConstants + appearing into masked arrays, to ensure that all return values are + writeable NumPy array objects. + + Any non-lazy arrays are passed through, as they are by `da.compute`. + They undergo the same result standardisation. """ computed_arrays = da.compute(*arrays) results = [] for lazy_in, real_out in zip(arrays, computed_arrays): + # Ensure we always have arrays. + # Note : in some cases dask (and numpy) will return a scalar + # numpy.int/numpy.float object rather than an ndarray. + # Recorded in https://github.com/dask/dask/issues/2111. real_out = np.asanyarray(real_out) if isinstance(real_out, ma.core.MaskedConstant): - # Convert any masked constants into NumPy masked arrays : In some - # cases dask may return a scalar numpy.int/numpy.float object - # rather than a numpy.ndarray object. - # Recorded in https://github.com/dask/dask/issues/2111. - # NOTE: also in this case, apply the original lazy-array dtype. + # Convert any masked constants into NumPy masked arrays. + # NOTE: in this case, also apply the original lazy-array dtype, as + # masked constants *always* have dtype float64. real_out = ma.masked_array(real_out.data, mask=real_out.mask, dtype=lazy_in.dtype) results.append(real_out) @@ -180,16 +186,37 @@ def multidim_lazy_stack(stack): return result -def co_realise_cubes(cubes): +def co_realise_cubes(*cubes): """ - Fetch real data for multiple cubes at one time. + Fetch 'real' data for multiple cubes, in a shared calculation. - This fetches lazy content, equivalent to accessing each cube.data. + This computes any lazy data, equivalent to accessing each `cube.data`. However, lazy calculations and data fetches can be shared between the computations, improving performance. + Args: + + * cubes : (list of `~iris.cube.Cube`) + Arguments, each of which is a cube to be realised. + + For example:: + + # Form stats. + a_std = cube_a.collapsed(['x', 'y'], iris.analysis.STD_DEV) + b_std = cube_b.collapsed(['x', 'y'], iris.analysis.STD_DEV) + ab_mean_diff = (cube_b - cube_a).collapsed(['x', 'y'], + iris.analysis.MEAN) + std_err = (a_std * a_std + b_std * b_std) ** 0.5 + + # Compute stats together (to avoid multiple data passes). + iris.co_realise_cubes(a_std, b_std, ab_mean_diff, std_err) + + + .. Note:: + + Cubes with non-lazy data may also be passed, with no ill effect. + """ results = _co_realise_lazy_arrays([cube.core_data() for cube in cubes]) for cube, result in zip(cubes, results): cube.data = result - return cubes diff --git a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py index 0aa8c809e4..03782cda85 100644 --- a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py +++ b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py @@ -46,30 +46,29 @@ def __getitem__(self, keys): class Test_co_realise_cubes(tests.IrisTest): def test_empty(self): - self.assertEqual(co_realise_cubes([]), []) + # Ensure that 'no args' case does not raise an error. + co_realise_cubes() def test_basic(self): real_data = np.arange(3.) cube = Cube(as_lazy_data(real_data)) - self.assertTrue(cube.has_lazy_data()) - result, = co_realise_cubes([cube]) - self.assertEqual(result, cube) + co_realise_cubes(cube) self.assertFalse(cube.has_lazy_data()) self.assertArrayAllClose(cube.core_data(), real_data) def test_multi(self): real_data = np.arange(3.) - cube = Cube(as_lazy_data(real_data)) - self.assertTrue(cube.has_lazy_data()) - cube_2 = cube + 1 - cube_3 = cube + 2 - cubes = [cube, cube_2, cube_3] - for cube in cubes: - self.assertTrue(cube.has_lazy_data()) - results = co_realise_cubes(cubes) - self.assertEqual(results, cubes) - for cube in cubes: - self.assertFalse(cube.has_lazy_data()) + cube_base = Cube(as_lazy_data(real_data)) + cube_inner = cube_base + 1 + result_a = cube_base + 1 + result_b = cube_inner + 1 + co_realise_cubes(result_a, result_b) + # Check that target cubes were realised. + self.assertFalse(result_a.has_lazy_data()) + self.assertFalse(result_b.has_lazy_data()) + # Check that other cubes referenced remain lazy. + self.assertTrue(cube_base.has_lazy_data()) + self.assertTrue(cube_inner.has_lazy_data()) def test_combined_access(self): wrapped_array = ArrayAccessCounter(np.arange(3.)) @@ -78,7 +77,7 @@ def test_combined_access(self): derived_b = lazy_array + 2 cube_a = Cube(derived_a) cube_b = Cube(derived_b) - co_realise_cubes([cube_a, cube_b]) + co_realise_cubes(cube_a, cube_b) # Though used twice, the source data should only get fetched once. self.assertEqual(wrapped_array.access_count, 1) From d5083354191e801232339b97fedb312c68c9ea30 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 8 Mar 2018 15:21:45 +0000 Subject: [PATCH 6/6] Fix docstring. --- lib/iris/_lazy_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index d043a5fda1..285aa8d9b1 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -196,7 +196,7 @@ def co_realise_cubes(*cubes): Args: - * cubes : (list of `~iris.cube.Cube`) + * cubes (list of :class:`~iris.cube.Cube`): Arguments, each of which is a cube to be realised. For example::