diff --git a/doc/conf.py b/doc/conf.py index 06986c70ca..9c42ec7eb8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -418,7 +418,7 @@ (f'https://docs.esmvaltool.org/projects/esmvalcore/en/{rtd_version}/', None), 'esmvaltool': (f'https://docs.esmvaltool.org/en/{rtd_version}/', None), - 'iris': ('https://scitools.org.uk/iris/docs/latest/', None), + 'iris': ('https://scitools-iris.readthedocs.io/en/latest/', None), 'matplotlib': ('https://matplotlib.org/', None), 'numpy': ('https://numpy.org/doc/stable/', None), 'python': ('https://docs.python.org/3/', None), diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index 6a5b72e673..88ba4c9ff7 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -456,17 +456,17 @@ Missing values masks -------------------- Missing (masked) values can be a nuisance especially when dealing with -multimodel ensembles and having to compute multimodel statistics; different +multi-model ensembles and having to compute multi-model statistics; different numbers of missing data from dataset to dataset may introduce biases and -artificially assign more weight to the datasets that have less missing -data. This is handled in ESMValTool via the missing values masks: two types of -such masks are available, one for the multimodel case and another for the -single model case. +artificially assign more weight to the datasets that have less missing data. +This is handled in ESMValTool via the missing values masks: two types of such +masks are available, one for the multi-model case and another for the single +model case. 
-The multimodel missing values mask (``mask_fillvalues``) is a preprocessor step +The multi-model missing values mask (``mask_fillvalues``) is a preprocessor step that usually comes after all the single-model steps (regridding, area selection etc) have been performed; in a nutshell, it combines missing values masks from -individual models into a multimodel missing values mask; the individual model +individual models into a multi-model missing values mask; the individual model masks are built according to common criteria: the user chooses a time window in which missing data points are counted, and if the number of missing data points relative to the number of total data points in a window is less than a chosen @@ -492,11 +492,11 @@ See also :func:`esmvalcore.preprocessor.mask_fillvalues`. Common mask for multiple models ------------------------------- -It is possible to use ``mask_fillvalues`` to create a combined multimodel -mask (all the masks from all the analyzed models combined into a single -mask); for that purpose setting the ``threshold_fraction`` to 0 will not -discard any time windows, essentially keeping the original model masks and -combining them into a single mask; here is an example: +It is possible to use ``mask_fillvalues`` to create a combined multi-model mask +(all the masks from all the analyzed models combined into a single mask); for +that purpose setting the ``threshold_fraction`` to 0 will not discard any time +windows, essentially keeping the original model masks and combining them into a +single mask; here is an example: .. code-block:: yaml @@ -530,13 +530,12 @@ Horizontal regridding Regridding is necessary when various datasets are available on a variety of `lat-lon` grids and they need to be brought together on a common grid (for -various statistical operations e.g. multimodel statistics or for e.g. direct +various statistical operations e.g. multi-model statistics or for e.g. 
direct inter-comparison or comparison with observational datasets). Regridding is conceptually a very similar process to interpolation (in fact, the regridder engine uses interpolation and extrapolation, with various schemes). The primary difference is that interpolation is based on sample data points, while -regridding is based on the horizontal grid of another cube (the reference -grid). +regridding is based on the horizontal grid of another cube (the reference grid). The underlying regridding mechanism in ESMValTool uses the `cube.regrid() `_ @@ -651,28 +650,28 @@ Multi-model statistics ====================== Computing multi-model statistics is an integral part of model analysis and evaluation: individual models display a variety of biases depending on model -set-up, initial conditions, forcings and implementation; comparing model data -to observational data, these biases have a significantly lower statistical -impact when using a multi-model ensemble. ESMValTool has the capability of -computing a number of multi-model statistical measures: using the preprocessor -module ``multi_model_statistics`` will enable the user to ask for either a -multi-model ``mean``, ``median``, ``max``, ``min``, ``std``, and / or -``pXX.YY`` with a set of argument parameters passed to -``multi_model_statistics``. Percentiles can be specified like ``p1.5`` or -``p95``. The decimal point will be replaced by a dash in the output file. - -Note that current multimodel statistics in ESMValTool are local (not global), -and are computed along the time axis. As such, can be computed across a common -overlap in time (by specifying ``span: overlap`` argument) or across the full -length in time of each model (by specifying ``span: full`` argument). +set-up, initial conditions, forcings and implementation; comparing model data to +observational data, these biases have a significantly lower statistical impact +when using a multi-model ensemble. 
ESMValTool has the capability of computing a +number of multi-model statistical measures: using the preprocessor module +``multi_model_statistics`` will enable the user to ask for either a multi-model +``mean``, ``median``, ``max``, ``min``, ``std``, and / or ``pXX.YY`` with a set +of argument parameters passed to ``multi_model_statistics``. Percentiles can be +specified like ``p1.5`` or ``p95``. The decimal point will be replaced by a dash +in the output file. Restrictive computation is also available by excluding any set of models that the user will not want to include in the statistics (by setting ``exclude: [excluded models list]`` argument). The implementation has a few restrictions -that apply to the input data: model datasets must have consistent shapes, and -from a statistical point of view, this is needed since weights are not yet -implemented; also higher dimensional data is not supported (i.e. anything with -dimensionality higher than four: time, vertical axis, two horizontal axes). +that apply to the input data: model datasets must have consistent shapes, apart +from the time dimension; and cubes with more than four dimensions (time, +vertical axis, two horizontal axes) are not supported. + +Input datasets may have different time coordinates. Statistics can be computed +across overlapping times only (``span: overlap``) or across the full time span +of the combined models (``span: full``). The preprocessor sets a common time +coordinate on all datasets. As the number of days in a year may vary between +calendars, (sub-)daily data with different calendars are not supported. Input datasets may have different time coordinates. The multi-model statistics preprocessor sets a common time coordinate on all datasets. As the number of @@ -681,7 +680,7 @@ days in a year may vary between calendars, (sub-)daily data are not supported. .. 
code-block:: yaml preprocessors: - multimodel_preprocessor: + multi_model_preprocessor: multi_model_statistics: span: overlap statistics: [mean, median] @@ -702,14 +701,12 @@ entry contains the resulting cube with the requested statistic operations. .. note:: - Note that the multimodel array operations, albeit performed in - per-time/per-horizontal level loops to save memory, could, however, be - rather memory-intensive (since they are not performed lazily as - yet). The Section on :ref:`Memory use` details the memory intake - for different run scenarios, but as a thumb rule, for the multimodel - preprocessor, the expected maximum memory intake could be approximated as - the number of datasets multiplied by the average size in memory for one - dataset. + The multi-model array operations can be rather memory-intensive (since they + are not performed lazily as yet). The Section on :ref:`Memory use` details + the memory intake for different run scenarios, but as a rule of thumb, for the + multi-model preprocessor, the expected maximum memory intake could be + approximated as the number of datasets multiplied by the average size in + memory for one dataset. .. 
_time operations: @@ -1512,14 +1509,14 @@ In the most general case, we can set upper limits on the maximum memory the analysis will require: -``Ms = (R + N) x F_eff - F_eff`` - when no multimodel analysis is performed; +``Ms = (R + N) x F_eff - F_eff`` - when no multi-model analysis is performed; -``Mm = (2R + N) x F_eff - 2F_eff`` - when multimodel analysis is performed; +``Mm = (2R + N) x F_eff - 2F_eff`` - when multi-model analysis is performed; where * ``Ms``: maximum memory for non-multimodel module -* ``Mm``: maximum memory for multimodel module +* ``Mm``: maximum memory for multi-model module * ``R``: computational efficiency of module; `R` is typically 2-3 * ``N``: number of datasets * ``F_eff``: average size of data per dataset where ``F_eff = e x f x F`` @@ -1538,7 +1535,7 @@ where ``Mm = 1.5 x (N - 2)`` GB As a rule of thumb, the maximum required memory at a certain time for -multimodel analysis could be estimated by multiplying the number of datasets by +multi-model analysis could be estimated by multiplying the number of datasets by the average file size of all the datasets; this memory intake is high but also assumes that all data is fully realized in memory; this aspect will gradually change and the amount of realized data will decrease with the increase of diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index 750768195a..80a930bbf5 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -1,15 +1,4 @@ -"""multimodel statistics. - -Functions for multi-model operations -supports a multitude of multimodel statistics -computations; the only requisite is the ingested -cubes have (TIME-LAT-LON) or (TIME-PLEV-LAT-LON) -dimensions; and obviously consistent units. 
- -It operates on different (time) spans: -- full: computes stats on full dataset time; -- overlap: computes common time overlap between datasets; -""" +"""Functions to compute multi-cube statistics.""" import logging import re @@ -295,66 +284,51 @@ def _assemble_data(cubes, statistic, span='overlap'): return stats_cube -def multi_model_statistics(products, span, statistics, output_products=None): - """Compute multi-model statistics. +def _multicube_statistics(cubes, statistics, span): + """Compute statistics over multiple cubes. + + Can be used e.g. for ensemble or multi-model statistics. - Multimodel statistics computed along the time axis. Can be - computed across a common overlap in time (set span: overlap) - or across the full length in time of each model (set span: full). - Restrictive computation is also available by excluding any set of - models that the user will not want to include in the statistics - (set exclude: [excluded models list]). + This function was designed to work on (max) four-dimensional data: + time, vertical axis, two horizontal axes. - Restrictions needed by the input data: - - model datasets must have consistent shapes, - - higher dimensional data is not supported (ie dims higher than four: - time, vertical axis, two horizontal axes). + Apart from the time coordinate, cubes must have consistent shapes. There + are two options to combine time coordinates of different lengths, see + the `span` argument. Parameters ---------- - products: list - list of data products or cubes to be used in multimodel stat - computation; - cube attribute of product is the data cube for computing the stats. + cubes: list + list of cubes over which the statistics will be computed; + statistics: list + statistical metrics to be computed. Available options: mean, median, + max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional). 
span: str overlap or full; if overlap, statitsticss are computed on common time- span; if full, statistics are computed on full time spans, ignoring missing data. - output_products: dict - dictionary of output products. MUST be specified if products are NOT - cubes - statistics: list of str - list of statistical measure(s) to be computed. Available options: - mean, median, max, min, std, or pXX.YY (for percentile XX.YY; decimal - part optional). Returns ------- - set or dict or list - `set` of data products if `output_products` is given - `dict` of cubes if `output_products` is not given - `list` of input cubes if there is no overlap between cubes when - using `span='overlap'` + dict + dictionary of statistics cubes with statistics' names as keys. Raises ------ ValueError If span is neither overlap nor full. """ - logger.debug('Multimodel statistics: computing: %s', statistics) - if len(products) < 2: - logger.info("Single dataset in list: will not compute statistics.") - return products - if output_products: - cubes = [cube for product in products for cube in product.cubes] - statistic_products = set() - else: - cubes = products - statistic_products = {} + if len(cubes) < 2: + logger.info('Found only 1 cube; no statistics computed for %r', + list(cubes)[0]) + return {statistic: cubes[0] for statistic in statistics} + + logger.debug('Multicube statistics: computing: %s', statistics) # Reset time coordinates and make cubes share the same calendar _unify_time_coordinates(cubes) + # Check whether input is valid if span == 'overlap': # check if we have any time overlap times = [cube.coord('time').points for cube in cubes] @@ -362,7 +336,7 @@ def multi_model_statistics(products, span, statistics, output_products=None): if len(overlap) <= 1: logger.info("Time overlap between cubes is none or a single point." 
"check datasets: will not compute statistics.") - return products + return cubes logger.debug("Using common time overlap between " "datasets to compute statistics.") elif span == 'full': @@ -372,22 +346,95 @@ def multi_model_statistics(products, span, statistics, output_products=None): "Unexpected value for span {}, choose from 'overlap', 'full'". format(span)) + # Compute statistics + statistics_cubes = {} for statistic in statistics: - # Compute statistic statistic_cube = _assemble_data(cubes, statistic, span) + statistics_cubes[statistic] = statistic_cube - if output_products: - # Add to output product and log provenance - statistic_product = output_products[statistic] - statistic_product.cubes = [statistic_cube] - for product in products: - statistic_product.wasderivedfrom(product) - logger.info("Generated %s", statistic_product) - statistic_products.add(statistic_product) - else: - statistic_products[statistic] = statistic_cube + return statistics_cubes + + +def _multiproduct_statistics(products, statistics, output_products, span=None): + """Compute multi-cube statistics on ESMValCore products. + + Extract cubes from products, calculate multicube statistics and + assign the resulting output cubes to the output_products. + """ + cubes = [cube for product in products for cube in product.cubes] + statistics_cubes = _multicube_statistics(cubes=cubes, + statistics=statistics, + span=span) + statistics_products = set() + for statistic, cube in statistics_cubes.items(): + statistics_product = output_products[statistic] + statistics_product.cubes = [cube] + + for product in products: + statistics_product.wasderivedfrom(product) + + logger.info("Generated %s", statistics_product) + statistics_products.add(statistics_product) + + return statistics_products + + +def multi_model_statistics(products, span, statistics, output_products=None): + """Compute multi-model statistics. + + This function computes multi-model statistics on cubes or products. 
+ Products (or: preprocessorfiles) are used internally by ESMValCore to store + workflow and provenance information, and this option should typically be + ignored. - if output_products: - products |= statistic_products - return products - return statistic_products + This function was designed to work on (max) four-dimensional data: time, + vertical axis, two horizontal axes. Apart from the time coordinate, cubes + must have consistent shapes. There are two options to combine time + coordinates of different lengths, see the `span` argument. + + Parameters + ---------- + products: list + Cubes (or products) over which the statistics will be computed. + statistics: list + Statistical metrics to be computed. Available options: mean, median, + max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional). + span: str + Overlap or full; if overlap, statistics are computed on common time- + span; if full, statistics are computed on full time spans, ignoring + missing data. + output_products: dict + For internal use only. A dict with statistics names as keys and + preprocessorfiles as values. If products are passed as input, the + statistics cubes will be assigned to these output products. + + Returns + ------- + dict + A dictionary of statistics cubes with statistics' names as keys. (If + input type is products, then it will return a set of output_products.) + + Raises + ------ + ValueError + If span is neither overlap nor full, or if input type is neither cubes + nor products. 
+ """ + if all(isinstance(p, iris.cube.Cube) for p in products): + return _multicube_statistics( + cubes=products, + statistics=statistics, + span=span, + ) + if all(type(p).__name__ == 'PreprocessorFile' for p in products): + # Avoid circular input: https://stackoverflow.com/q/16964467 + return _multiproduct_statistics( + products=products, + statistics=statistics, + output_products=output_products, + span=span, + ) + raise ValueError( + "Input type for multi_model_statistics not understood. Expected " + "iris.cube.Cube or esmvalcore.preprocessor.PreprocessorFile, " + "got {}".format(products)) diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index a527a9b913..9a094c4986 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -8,7 +8,8 @@ import numpy as np import pytest -from esmvalcore.preprocessor import extract_time, multi_model_statistics +from esmvalcore.preprocessor import extract_time +from esmvalcore.preprocessor._multimodel import multi_model_statistics esmvaltool_sample_data = pytest.importorskip("esmvaltool_sample_data") @@ -118,11 +119,13 @@ def calendar(cube): return cube_dict -def multimodel_test(cubes, span, statistic): +def multimodel_test(cubes, statistic, span): """Run multimodel test with some simple checks.""" statistics = [statistic] - result = multi_model_statistics(cubes, span=span, statistics=statistics) + result = multi_model_statistics(products=cubes, + statistics=statistics, + span=span) assert isinstance(result, dict) assert statistic in result @@ -139,7 +142,7 @@ def multimodel_regression_test(cubes, span, name): are being written. 
""" statistic = 'mean' - result = multimodel_test(cubes, span=span, statistic=statistic) + result = multimodel_test(cubes, statistic=statistic, span=span) result_cube = result[statistic] filename = Path(__file__).with_name(f'{name}-{span}-{statistic}.nc') diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index 42b28a9525..1899e09097 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -8,16 +8,18 @@ import tests from esmvalcore.preprocessor import multi_model_statistics -from esmvalcore.preprocessor._multimodel import (_assemble_data, - _compute_statistic, - _get_time_slice, _plev_fix, - _put_in_cube, - _unify_time_coordinates) +from esmvalcore.preprocessor._multimodel import ( + _assemble_data, + _compute_statistic, + _get_time_slice, + _plev_fix, + _put_in_cube, + _unify_time_coordinates, +) class Test(tests.Test): """Test class for preprocessor/_multimodel.py.""" - def setUp(self): """Prepare tests.""" # Make various time arrays @@ -92,7 +94,9 @@ def test_compute_statistic(self): def test_compute_full_statistic_mon_cube(self): data = [self.cube1, self.cube2] - stats = multi_model_statistics(data, 'full', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='full') expected_full_mean = np.ma.ones((5, 3, 2, 2)) expected_full_mean.mask = np.ones((5, 3, 2, 2)) expected_full_mean.mask[1] = False @@ -100,7 +104,9 @@ def test_compute_full_statistic_mon_cube(self): def test_compute_full_statistic_yr_cube(self): data = [self.cube4, self.cube5] - stats = multi_model_statistics(data, 'full', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='full') expected_full_mean = np.ma.ones((4, 3, 2, 2)) expected_full_mean.mask = np.zeros((4, 3, 2, 2)) expected_full_mean.mask[2:4] = True @@ -108,13 +114,17 @@ def test_compute_full_statistic_yr_cube(self): def 
test_compute_overlap_statistic_mon_cube(self): data = [self.cube1, self.cube1] - stats = multi_model_statistics(data, 'overlap', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='overlap') expected_ovlap_mean = np.ma.ones((2, 3, 2, 2)) self.assert_array_equal(stats['mean'].data, expected_ovlap_mean) def test_compute_overlap_statistic_yr_cube(self): data = [self.cube4, self.cube4] - stats = multi_model_statistics(data, 'overlap', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='overlap') expected_ovlap_mean = np.ma.ones((2, 3, 2, 2)) self.assert_array_equal(stats['mean'].data, expected_ovlap_mean)