diff --git a/doc/conf.py b/doc/conf.py index 06986c70ca..9c42ec7eb8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -418,7 +418,7 @@ (f'https://docs.esmvaltool.org/projects/esmvalcore/en/{rtd_version}/', None), 'esmvaltool': (f'https://docs.esmvaltool.org/en/{rtd_version}/', None), - 'iris': ('https://scitools.org.uk/iris/docs/latest/', None), + 'iris': ('https://scitools-iris.readthedocs.io/en/latest/', None), 'matplotlib': ('https://matplotlib.org/', None), 'numpy': ('https://numpy.org/doc/stable/', None), 'python': ('https://docs.python.org/3/', None), diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index 6a5b72e673..88ba4c9ff7 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -456,17 +456,17 @@ Missing values masks -------------------- Missing (masked) values can be a nuisance especially when dealing with -multimodel ensembles and having to compute multimodel statistics; different +multi-model ensembles and having to compute multi-model statistics; different numbers of missing data from dataset to dataset may introduce biases and -artificially assign more weight to the datasets that have less missing -data. This is handled in ESMValTool via the missing values masks: two types of -such masks are available, one for the multimodel case and another for the -single model case. +artificially assign more weight to the datasets that have less missing data. +This is handled in ESMValTool via the missing values masks: two types of such +masks are available, one for the multi-model case and another for the single +model case. 
-The multimodel missing values mask (``mask_fillvalues``) is a preprocessor step +The multi-model missing values mask (``mask_fillvalues``) is a preprocessor step that usually comes after all the single-model steps (regridding, area selection etc) have been performed; in a nutshell, it combines missing values masks from -individual models into a multimodel missing values mask; the individual model +individual models into a multi-model missing values mask; the individual model masks are built according to common criteria: the user chooses a time window in which missing data points are counted, and if the number of missing data points relative to the number of total data points in a window is less than a chosen @@ -492,11 +492,11 @@ See also :func:`esmvalcore.preprocessor.mask_fillvalues`. Common mask for multiple models ------------------------------- -It is possible to use ``mask_fillvalues`` to create a combined multimodel -mask (all the masks from all the analyzed models combined into a single -mask); for that purpose setting the ``threshold_fraction`` to 0 will not -discard any time windows, essentially keeping the original model masks and -combining them into a single mask; here is an example: +It is possible to use ``mask_fillvalues`` to create a combined multi-model mask +(all the masks from all the analyzed models combined into a single mask); for +that purpose setting the ``threshold_fraction`` to 0 will not discard any time +windows, essentially keeping the original model masks and combining them into a +single mask; here is an example: .. code-block:: yaml @@ -530,13 +530,12 @@ Horizontal regridding Regridding is necessary when various datasets are available on a variety of `lat-lon` grids and they need to be brought together on a common grid (for -various statistical operations e.g. multimodel statistics or for e.g. direct +various statistical operations e.g. multi-model statistics or for e.g. 
direct inter-comparison or comparison with observational datasets). Regridding is conceptually a very similar process to interpolation (in fact, the regridder engine uses interpolation and extrapolation, with various schemes). The primary difference is that interpolation is based on sample data points, while -regridding is based on the horizontal grid of another cube (the reference -grid). +regridding is based on the horizontal grid of another cube (the reference grid). The underlying regridding mechanism in ESMValTool uses the `cube.regrid() `_ @@ -651,28 +650,28 @@ Multi-model statistics ====================== Computing multi-model statistics is an integral part of model analysis and evaluation: individual models display a variety of biases depending on model -set-up, initial conditions, forcings and implementation; comparing model data -to observational data, these biases have a significantly lower statistical -impact when using a multi-model ensemble. ESMValTool has the capability of -computing a number of multi-model statistical measures: using the preprocessor -module ``multi_model_statistics`` will enable the user to ask for either a -multi-model ``mean``, ``median``, ``max``, ``min``, ``std``, and / or -``pXX.YY`` with a set of argument parameters passed to -``multi_model_statistics``. Percentiles can be specified like ``p1.5`` or -``p95``. The decimal point will be replaced by a dash in the output file. - -Note that current multimodel statistics in ESMValTool are local (not global), -and are computed along the time axis. As such, can be computed across a common -overlap in time (by specifying ``span: overlap`` argument) or across the full -length in time of each model (by specifying ``span: full`` argument). +set-up, initial conditions, forcings and implementation; comparing model data to +observational data, these biases have a significantly lower statistical impact +when using a multi-model ensemble. 
ESMValTool has the capability of computing a +number of multi-model statistical measures: using the preprocessor module +``multi_model_statistics`` will enable the user to ask for either a multi-model +``mean``, ``median``, ``max``, ``min``, ``std``, and / or ``pXX.YY`` with a set +of argument parameters passed to ``multi_model_statistics``. Percentiles can be +specified like ``p1.5`` or ``p95``. The decimal point will be replaced by a dash +in the output file. Restrictive computation is also available by excluding any set of models that the user will not want to include in the statistics (by setting ``exclude: [excluded models list]`` argument). The implementation has a few restrictions -that apply to the input data: model datasets must have consistent shapes, and -from a statistical point of view, this is needed since weights are not yet -implemented; also higher dimensional data is not supported (i.e. anything with -dimensionality higher than four: time, vertical axis, two horizontal axes). +that apply to the input data: model datasets must have consistent shapes, apart +from the time dimension; and cubes with more than four dimensions (time, +vertical axis, two horizontal axes) are not supported. + +Input datasets may have different time coordinates. Statistics can be computed +across overlapping times only (``span: overlap``) or across the full time span +of the combined models (``span: full``). The preprocessor sets a common time +coordinate on all datasets. As the number of days in a year may vary between +calendars, (sub-)daily data with different calendars are not supported. Input datasets may have different time coordinates. The multi-model statistics preprocessor sets a common time coordinate on all datasets. As the number of @@ -681,7 +680,7 @@ days in a year may vary between calendars, (sub-)daily data are not supported. .. 
code-block:: yaml preprocessors: - multimodel_preprocessor: + multi_model_preprocessor: multi_model_statistics: span: overlap statistics: [mean, median] @@ -702,14 +701,12 @@ entry contains the resulting cube with the requested statistic operations. .. note:: - Note that the multimodel array operations, albeit performed in - per-time/per-horizontal level loops to save memory, could, however, be - rather memory-intensive (since they are not performed lazily as - yet). The Section on :ref:`Memory use` details the memory intake - for different run scenarios, but as a thumb rule, for the multimodel - preprocessor, the expected maximum memory intake could be approximated as - the number of datasets multiplied by the average size in memory for one - dataset. + The multi-model array operations can be rather memory-intensive (since they + are not performed lazily as yet). The Section on :ref:`Memory use` details + the memory intake for different run scenarios, but as a rule of thumb, for the + multi-model preprocessor, the expected maximum memory intake could be + approximated as the number of datasets multiplied by the average size in + memory for one dataset. .. 
_time operations: @@ -1512,14 +1509,14 @@ In the most general case, we can set upper limits on the maximum memory the analysis will require: -``Ms = (R + N) x F_eff - F_eff`` - when no multimodel analysis is performed; +``Ms = (R + N) x F_eff - F_eff`` - when no multi-model analysis is performed; -``Mm = (2R + N) x F_eff - 2F_eff`` - when multimodel analysis is performed; +``Mm = (2R + N) x F_eff - 2F_eff`` - when multi-model analysis is performed; where * ``Ms``: maximum memory for non-multimodel module -* ``Mm``: maximum memory for multimodel module +* ``Mm``: maximum memory for multi-model module * ``R``: computational efficiency of module; `R` is typically 2-3 * ``N``: number of datasets * ``F_eff``: average size of data per dataset where ``F_eff = e x f x F`` @@ -1538,7 +1535,7 @@ where ``Mm = 1.5 x (N - 2)`` GB As a rule of thumb, the maximum required memory at a certain time for -multimodel analysis could be estimated by multiplying the number of datasets by +multi-model analysis could be estimated by multiplying the number of datasets by the average file size of all the datasets; this memory intake is high but also assumes that all data is fully realized in memory; this aspect will gradually change and the amount of realized data will decrease with the increase of diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index 750768195a..80a930bbf5 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -1,15 +1,4 @@ -"""multimodel statistics. - -Functions for multi-model operations -supports a multitude of multimodel statistics -computations; the only requisite is the ingested -cubes have (TIME-LAT-LON) or (TIME-PLEV-LAT-LON) -dimensions; and obviously consistent units. 
- -It operates on different (time) spans: -- full: computes stats on full dataset time; -- overlap: computes common time overlap between datasets; -""" +"""Functions to compute multi-cube statistics.""" import logging import re @@ -295,66 +284,51 @@ def _assemble_data(cubes, statistic, span='overlap'): return stats_cube -def multi_model_statistics(products, span, statistics, output_products=None): - """Compute multi-model statistics. +def _multicube_statistics(cubes, statistics, span): + """Compute statistics over multiple cubes. + + Can be used e.g. for ensemble or multi-model statistics. - Multimodel statistics computed along the time axis. Can be - computed across a common overlap in time (set span: overlap) - or across the full length in time of each model (set span: full). - Restrictive computation is also available by excluding any set of - models that the user will not want to include in the statistics - (set exclude: [excluded models list]). + This function was designed to work on (max) four-dimensional data: + time, vertical axis, two horizontal axes. - Restrictions needed by the input data: - - model datasets must have consistent shapes, - - higher dimensional data is not supported (ie dims higher than four: - time, vertical axis, two horizontal axes). + Apart from the time coordinate, cubes must have consistent shapes. There + are two options to combine time coordinates of different lengths, see + the `span` argument. Parameters ---------- - products: list - list of data products or cubes to be used in multimodel stat - computation; - cube attribute of product is the data cube for computing the stats. + cubes: list + list of cubes over which the statistics will be computed; + statistics: list + statistical metrics to be computed. Available options: mean, median, + max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional). 
span: str overlap or full; if overlap, statitsticss are computed on common time- span; if full, statistics are computed on full time spans, ignoring missing data. - output_products: dict - dictionary of output products. MUST be specified if products are NOT - cubes - statistics: list of str - list of statistical measure(s) to be computed. Available options: - mean, median, max, min, std, or pXX.YY (for percentile XX.YY; decimal - part optional). Returns ------- - set or dict or list - `set` of data products if `output_products` is given - `dict` of cubes if `output_products` is not given - `list` of input cubes if there is no overlap between cubes when - using `span='overlap'` + dict + dictionary of statistics cubes with statistics' names as keys. Raises ------ ValueError If span is neither overlap nor full. """ - logger.debug('Multimodel statistics: computing: %s', statistics) - if len(products) < 2: - logger.info("Single dataset in list: will not compute statistics.") - return products - if output_products: - cubes = [cube for product in products for cube in product.cubes] - statistic_products = set() - else: - cubes = products - statistic_products = {} + if len(cubes) < 2: + logger.info('Found only 1 cube; no statistics computed for %r', + list(cubes)[0]) + return {statistic: cubes[0] for statistic in statistics} + + logger.debug('Multicube statistics: computing: %s', statistics) # Reset time coordinates and make cubes share the same calendar _unify_time_coordinates(cubes) + # Check whether input is valid if span == 'overlap': # check if we have any time overlap times = [cube.coord('time').points for cube in cubes] @@ -362,7 +336,7 @@ def multi_model_statistics(products, span, statistics, output_products=None): if len(overlap) <= 1: logger.info("Time overlap between cubes is none or a single point." 
"check datasets: will not compute statistics.") - return products + return cubes logger.debug("Using common time overlap between " "datasets to compute statistics.") elif span == 'full': @@ -372,22 +346,95 @@ def multi_model_statistics(products, span, statistics, output_products=None): "Unexpected value for span {}, choose from 'overlap', 'full'". format(span)) + # Compute statistics + statistics_cubes = {} for statistic in statistics: - # Compute statistic statistic_cube = _assemble_data(cubes, statistic, span) + statistics_cubes[statistic] = statistic_cube - if output_products: - # Add to output product and log provenance - statistic_product = output_products[statistic] - statistic_product.cubes = [statistic_cube] - for product in products: - statistic_product.wasderivedfrom(product) - logger.info("Generated %s", statistic_product) - statistic_products.add(statistic_product) - else: - statistic_products[statistic] = statistic_cube + return statistics_cubes + + +def _multiproduct_statistics(products, statistics, output_products, span=None): + """Compute multi-cube statistics on ESMValCore products. + + Extract cubes from products, calculate multicube statistics and + assign the resulting output cubes to the output_products. + """ + cubes = [cube for product in products for cube in product.cubes] + statistics_cubes = _multicube_statistics(cubes=cubes, + statistics=statistics, + span=span) + statistics_products = set() + for statistic, cube in statistics_cubes.items(): + statistics_product = output_products[statistic] + statistics_product.cubes = [cube] + + for product in products: + statistics_product.wasderivedfrom(product) + + logger.info("Generated %s", statistics_product) + statistics_products.add(statistics_product) + + return statistics_products + + +def multi_model_statistics(products, span, statistics, output_products=None): + """Compute multi-model statistics. + + This function computes multi-model statistics on cubes or products. 
+ Products (or: preprocessorfiles) are used internally by ESMValCore to store + workflow and provenance information, and this option should typically be + ignored. - if output_products: - products |= statistic_products - return products - return statistic_products + This function was designed to work on (max) four-dimensional data: time, + vertical axis, two horizontal axes. Apart from the time coordinate, cubes + must have consistent shapes. There are two options to combine time + coordinates of different lengths, see the `span` argument. + + Parameters + ---------- + products: list + Cubes (or products) over which the statistics will be computed. + statistics: list + Statistical metrics to be computed. Available options: mean, median, + max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional). + span: str + Overlap or full; if overlap, statistics are computed on common time- + span; if full, statistics are computed on full time spans, ignoring + missing data. + output_products: dict + For internal use only. A dict with statistics names as keys and + preprocessorfiles as values. If products are passed as input, the + statistics cubes will be assigned to these output products. + + Returns + ------- + dict + A dictionary of statistics cubes with statistics' names as keys. (If + input type is products, then it will return a set of output_products.) + + Raises + ------ + ValueError + If span is neither overlap nor full, or if input type is neither cubes + nor products. 
+ """ + if all(isinstance(p, iris.cube.Cube) for p in products): + return _multicube_statistics( + cubes=products, + statistics=statistics, + span=span, + ) + if all(type(p).__name__ == 'PreprocessorFile' for p in products): + # Avoid circular input: https://stackoverflow.com/q/16964467 + return _multiproduct_statistics( + products=products, + statistics=statistics, + output_products=output_products, + span=span, + ) + raise ValueError( + "Input type for multi_model_statistics not understood. Expected " + "iris.cube.Cube or esmvalcore.preprocessor.PreprocessorFile, " + "got {}".format(products)) diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index a527a9b913..9a094c4986 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -8,7 +8,8 @@ import numpy as np import pytest -from esmvalcore.preprocessor import extract_time, multi_model_statistics +from esmvalcore.preprocessor import extract_time +from esmvalcore.preprocessor._multimodel import multi_model_statistics esmvaltool_sample_data = pytest.importorskip("esmvaltool_sample_data") @@ -118,11 +119,13 @@ def calendar(cube): return cube_dict -def multimodel_test(cubes, span, statistic): +def multimodel_test(cubes, statistic, span): """Run multimodel test with some simple checks.""" statistics = [statistic] - result = multi_model_statistics(cubes, span=span, statistics=statistics) + result = multi_model_statistics(products=cubes, + statistics=statistics, + span=span) assert isinstance(result, dict) assert statistic in result @@ -139,7 +142,7 @@ def multimodel_regression_test(cubes, span, name): are being written. 
""" statistic = 'mean' - result = multimodel_test(cubes, span=span, statistic=statistic) + result = multimodel_test(cubes, statistic=statistic, span=span) result_cube = result[statistic] filename = Path(__file__).with_name(f'{name}-{span}-{statistic}.nc') diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index 42b28a9525..1899e09097 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -8,16 +8,18 @@ import tests from esmvalcore.preprocessor import multi_model_statistics -from esmvalcore.preprocessor._multimodel import (_assemble_data, - _compute_statistic, - _get_time_slice, _plev_fix, - _put_in_cube, - _unify_time_coordinates) +from esmvalcore.preprocessor._multimodel import ( + _assemble_data, + _compute_statistic, + _get_time_slice, + _plev_fix, + _put_in_cube, + _unify_time_coordinates, +) class Test(tests.Test): """Test class for preprocessor/_multimodel.py.""" - def setUp(self): """Prepare tests.""" # Make various time arrays @@ -92,7 +94,9 @@ def test_compute_statistic(self): def test_compute_full_statistic_mon_cube(self): data = [self.cube1, self.cube2] - stats = multi_model_statistics(data, 'full', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='full') expected_full_mean = np.ma.ones((5, 3, 2, 2)) expected_full_mean.mask = np.ones((5, 3, 2, 2)) expected_full_mean.mask[1] = False @@ -100,7 +104,9 @@ def test_compute_full_statistic_mon_cube(self): def test_compute_full_statistic_yr_cube(self): data = [self.cube4, self.cube5] - stats = multi_model_statistics(data, 'full', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='full') expected_full_mean = np.ma.ones((4, 3, 2, 2)) expected_full_mean.mask = np.zeros((4, 3, 2, 2)) expected_full_mean.mask[2:4] = True @@ -108,13 +114,17 @@ def test_compute_full_statistic_yr_cube(self): def 
test_compute_overlap_statistic_mon_cube(self): data = [self.cube1, self.cube1] - stats = multi_model_statistics(data, 'overlap', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='overlap') expected_ovlap_mean = np.ma.ones((2, 3, 2, 2)) self.assert_array_equal(stats['mean'].data, expected_ovlap_mean) def test_compute_overlap_statistic_yr_cube(self): data = [self.cube4, self.cube4] - stats = multi_model_statistics(data, 'overlap', ['mean']) + stats = multi_model_statistics(products=data, + statistics=['mean'], + span='overlap') expected_ovlap_mean = np.ma.ones((2, 3, 2, 2)) self.assert_array_equal(stats['mean'].data, expected_ovlap_mean)