-
Notifications
You must be signed in to change notification settings - Fork 286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add cube co-realisation. #2967
Add cube co-realisation. #2967
Changes from 4 commits
13bdf2e
499972c
926d2db
4f2b226
6345c29
d508335
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
* Added new function :func:`iris.co_realise_cubes` to compute multiple lazy | ||
values in a single operation, avoiding repeated re-loading of data or | ||
re-calculation of expressions. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# (C) British Crown Copyright 2017, Met Office | ||
# (C) British Crown Copyright 2017 - 2018, Met Office | ||
# | ||
# This file is part of Iris. | ||
# | ||
|
@@ -102,6 +102,33 @@ def as_lazy_data(data, chunks=None, asarray=False): | |
return data | ||
|
||
|
||
def _co_realise_lazy_arrays(arrays): | ||
""" | ||
Compute multiple lazy arrays and return a list of real values. | ||
|
||
All the arrays are computed together, so they can share results for common | ||
graph elements. | ||
|
||
Also converts any MaskedConstants returned into masked arrays, to ensure | ||
that all return values are writeable NumPy array objects. | ||
|
||
""" | ||
computed_arrays = da.compute(*arrays) | ||
results = [] | ||
for lazy_in, real_out in zip(arrays, computed_arrays): | ||
real_out = np.asanyarray(real_out) | ||
if isinstance(real_out, ma.core.MaskedConstant): | ||
# Convert any masked constants into NumPy masked arrays : In some | ||
# cases dask may return a scalar numpy.int/numpy.float object | ||
# rather than a numpy.ndarray object. | ||
# Recorded in https://github.com/dask/dask/issues/2111. | ||
# NOTE: also in this case, apply the original lazy-array dtype. | ||
real_out = ma.masked_array(real_out.data, mask=real_out.mask, | ||
dtype=lazy_in.dtype) | ||
results.append(real_out) | ||
return results | ||
|
||
|
||
def as_concrete_data(data): | ||
""" | ||
Return the actual content of a lazy array, as a numpy array. | ||
|
@@ -120,14 +147,7 @@ def as_concrete_data(data): | |
|
||
""" | ||
if is_lazy_data(data): | ||
# Realise dask array, ensuring the data result is always a NumPy array. | ||
# In some cases dask may return a scalar numpy.int/numpy.float object | ||
# rather than a numpy.ndarray object. | ||
# Recorded in https://github.com/dask/dask/issues/2111. | ||
dtype = data.dtype | ||
data = np.asanyarray(data.compute()) | ||
if isinstance(data, ma.core.MaskedConstant): | ||
data = ma.masked_array(data.data, dtype=dtype, mask=data.mask) | ||
data, = _co_realise_lazy_arrays([data]) | ||
|
||
return data | ||
|
||
|
@@ -158,3 +178,18 @@ def multidim_lazy_stack(stack): | |
result = da.stack([multidim_lazy_stack(subarray) | ||
for subarray in stack]) | ||
return result | ||
|
||
|
||
def co_realise_cubes(cubes): | ||
""" | ||
Fetch real data for multiple cubes at one time. | ||
|
||
This fetches lazy content, equivalent to accessing each cube.data. | ||
However, lazy calculations and data fetches can be shared between the | ||
computations, improving performance. | ||
|
||
""" | ||
results = _co_realise_lazy_arrays([cube.core_data() for cube in cubes]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like we should allow a combination of lazy and realised cubes to be passed to this method, and in fact I think the current code allows that ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right it absolutely was in my mind that it does do this. |
||
for cube, result in zip(cubes, results): | ||
cube.data = result | ||
return cubes |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# (C) British Crown Copyright 2018, Met Office | ||
# | ||
# This file is part of Iris. | ||
# | ||
# Iris is free software: you can redistribute it and/or modify it under | ||
# the terms of the GNU Lesser General Public License as published by the | ||
# Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# Iris is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU Lesser General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU Lesser General Public License | ||
# along with Iris. If not, see <http://www.gnu.org/licenses/>. | ||
"""Test function :func:`iris._lazy data.co_realise_cubes`.""" | ||
|
||
from __future__ import (absolute_import, division, print_function) | ||
from six.moves import (filter, input, map, range, zip) # noqa | ||
|
||
# Import iris.tests first so that some things can be initialised before | ||
# importing anything else. | ||
import iris.tests as tests | ||
|
||
from mock import MagicMock | ||
import numpy as np | ||
|
||
from iris.cube import Cube | ||
from iris._lazy_data import as_lazy_data | ||
|
||
from iris._lazy_data import co_realise_cubes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
|
||
|
||
class ArrayAccessCounter(object): | ||
def __init__(self, array): | ||
self.dtype = array.dtype | ||
self.shape = array.shape | ||
self._array = array | ||
self.access_count = 0 | ||
|
||
def __getitem__(self, keys): | ||
self.access_count += 1 | ||
return self._array[keys] | ||
|
||
|
||
class Test_co_realise_cubes(tests.IrisTest): | ||
def test_empty(self): | ||
self.assertEqual(co_realise_cubes([]), []) | ||
|
||
def test_basic(self): | ||
real_data = np.arange(3.) | ||
cube = Cube(as_lazy_data(real_data)) | ||
self.assertTrue(cube.has_lazy_data()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it's necessary to make this assertion here; you're not testing this functionality. (Also in the test below). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
result, = co_realise_cubes([cube]) | ||
self.assertEqual(result, cube) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually it makes me think: should There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. assertIs : 👍 |
||
self.assertFalse(cube.has_lazy_data()) | ||
self.assertArrayAllClose(cube.core_data(), real_data) | ||
|
||
def test_multi(self): | ||
real_data = np.arange(3.) | ||
cube = Cube(as_lazy_data(real_data)) | ||
self.assertTrue(cube.has_lazy_data()) | ||
cube_2 = cube + 1 | ||
cube_3 = cube + 2 | ||
cubes = [cube, cube_2, cube_3] | ||
for cube in cubes: | ||
self.assertTrue(cube.has_lazy_data()) | ||
results = co_realise_cubes(cubes) | ||
self.assertEqual(results, cubes) | ||
for cube in cubes: | ||
self.assertFalse(cube.has_lazy_data()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My comments above also apply to this test. |
||
|
||
def test_combined_access(self): | ||
wrapped_array = ArrayAccessCounter(np.arange(3.)) | ||
lazy_array = as_lazy_data(wrapped_array) | ||
derived_a = lazy_array + 1 | ||
derived_b = lazy_array + 2 | ||
cube_a = Cube(derived_a) | ||
cube_b = Cube(derived_b) | ||
co_realise_cubes([cube_a, cube_b]) | ||
# Though used twice, the source data should only get fetched once. | ||
self.assertEqual(wrapped_array.access_count, 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice test! |
||
|
||
|
||
if __name__ == '__main__': | ||
tests.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The above comment isn't related to MaskedConstants specifically, but rather the (lack of) distinction in NumPy between scalar values (e.g.
np.float64(1.)
) and scalar arrays (e.g.np.array(1., dtype=np.float64)
). It refers to theasanyarray()
at line 119.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I though so, but frankly I wasn't totally certain !
This comment is basically cut+paste from what was formerly in the as_concrete_data function.
I will amend ...