diff --git a/cf/__init__.py b/cf/__init__.py index 8450c86f4d..99fbb6cd22 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -78,7 +78,7 @@ __date__ = "2022-01-18" __version__ = "4.0.0b0" -_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil") +_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil", "dask") x = ", ".join(_requires) _error0 = f"cf v{ __version__} requires the modules {x}. " @@ -104,10 +104,6 @@ _found_ESMF = bool(importlib.util.find_spec("ESMF")) -# TODODASK - Remove the next 2 lines when the move to dask is complete -mpi_on = False -mpi_size = 1 - try: import netCDF4 except ImportError as error1: @@ -133,6 +129,11 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) +try: + import dask +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + # Check the version of psutil _minimum_vn = "0.6.0" if LooseVersion(psutil.__version__) < LooseVersion(_minimum_vn): @@ -183,6 +184,14 @@ f"Got {_cfdm_version} at {cfdm.__file__}" ) +# Check the version of dask +_minimum_vn = "2022.03.0" +if LooseVersion(dask.__version__) < LooseVersion(_minimum_vn): + raise RuntimeError( + f"Bad dask version: cf requires dask>={_minimum_vn}. " + f"Got {dask.__version__} at {dask.__file__}" + ) + from .constructs import Constructs from .mixin import Coordinate diff --git a/cf/data/collapse.py b/cf/data/collapse.py new file mode 100644 index 0000000000..5b938f99a1 --- /dev/null +++ b/cf/data/collapse.py @@ -0,0 +1,2125 @@ +"""Functions used during `Data` object collapses.""" +import inspect +from functools import partial, reduce +from operator import mul + +import numpy as np +from cfdm.core import DocstringRewriteMeta +from dask.array import chunk +from dask.array.core import _concatenate2 +from dask.array.reductions import divide, numel, reduction +from dask.core import flatten +from dask.utils import deepmap + +from ..docstring import _docstring_substitution_definitions + + +class Collapse(metaclass=DocstringRewriteMeta): + """Container for functions that collapse dask arrays. + + .. versionadded:: TODODASK + + """ + + def __docstring_substitutions__(self): + """Define docstring substitutions that apply to this class and + all of its subclasses. + + These are in addition to, and take precedence over, docstring + substitutions defined by the base classes of this class. + + See `_docstring_substitutions` for details. + + .. versionadded:: TODODASK + + .. seealso:: `_docstring_substitutions` + + :Returns: + + `dict` + The docstring substitutions that have been applied. + + """ + return _docstring_substitution_definitions + + def __docstring_package_depth__(self): + """Return the package depth for "package" docstring + substitutions. + + See `_docstring_package_depth` for details. + + .. versionadded:: TODODASK + + """ + return 0 + + @classmethod + def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return maximum values of an array. + + Calculates the maximum value of an array or the maximum values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array.
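+ + **Examples** + + An illustrative example. Note that *mtol* is given explicitly + here, since `mask_small_sample_size` assumes a numeric *mtol*: + + >>> import dask.array as da + >>> from cf.data.collapse import Collapse + >>> d = da.arange(12).reshape(4, 3) + >>> Collapse.max(d, axis=0, mtol=1).compute() + array([ 9, 10, 11])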
+ + """ + check_input_dtype(a) + dtype = a.dtype + return reduction( + a, + cf_max_chunk, + partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_max_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def max_abs( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): + """Return maximum absolute values of an array. + + Calculates the maximum absolute value of an array or the + maximum absolute values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + return cls.max( + abs(a), + axis=axis, + keepdims=keepdims, + mtol=mtol, + split_every=split_every, + ) + + @classmethod + def mean( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return mean values of an array. + + Calculates the mean value of an array or the mean values along + axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "f8" + return reduction( + a, + cf_mean_chunk, + partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def mean_abs( + cls, + a, + weights=None, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return mean absolute values of an array. + + Calculates the mean absolute value of an array or the mean + absolute values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + return cls.mean( + abs(a), + weights=weights, + axis=axis, + keepdims=keepdims, + mtol=mtol, + split_every=split_every, + ) + + @classmethod + def mid_range( + cls, + a, + axis=None, + dtype=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return mid-range values of an array. + + Calculates the mid-range value of an array or the mid-range + values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. 
versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a, allowed="fi") + dtype = "f8" + return reduction( + a, + cf_range_chunk, + partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return minimum values of an array. + + Calculates the minimum value of an array or the minimum values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = a.dtype + return reduction( + a, + cf_min_chunk, + partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_min_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def min_abs( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): + """Return minimum absolute values of an array. + + Calculates the minimum absolute value of an array or the + minimum absolute values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + return cls.min( + abs(a), + axis=axis, + keepdims=keepdims, + mtol=mtol, + split_every=split_every, + ) + + @classmethod + def range(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return range values of an array. + + Calculates the range value of an array or the range values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. 
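+ + **Examples** + + An illustrative example (the range is the maximum minus the + minimum, computed here along axis 1): + + >>> import dask.array as da + >>> from cf.data.collapse import Collapse + >>> d = da.asarray([[1, 5, 3], [2, 8, 4]]) + >>> Collapse.range(d, axis=1, mtol=1).compute() + array([4, 6])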
+ + """ + check_input_dtype(a, allowed="fi") + dtype = a.dtype + return reduction( + a, + cf_range_chunk, + partial(cf_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def rms( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return root mean square (RMS) values of an array. + + Calculates the RMS value of an array or the RMS values along + axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "f8" + return reduction( + a, + cf_rms_chunk, + partial(cf_rms_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def sample_size( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): + """Return sample size values of an array. + + Calculates the sample size value of an array or the sample + size values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "i8" + return reduction( + a, + cf_sample_size_chunk, + partial(cf_sample_size_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sample_size_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def sum( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return sum values of an array. + + Calculates the sum value of an array or the sum values along + axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. 
+ + """ + check_input_dtype(a) + dtype = double_precision_dtype(a) + if weights is not None: + dtype = np.result_type(double_precision_dtype(weights), dtype) + + return reduction( + a, + cf_sum_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def sum_of_weights( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return sum of weights values for an array. + + Calculates the sum of weights value for an array or the sum of + weights values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = double_precision_dtype(weights, default="i8") + return reduction( + a, + cf_sum_of_weights_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def sum_of_weights2( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return sum of squares of weights values for an array. + + Calculates the sum of squares of weights value for an array or + the sum of squares of weights values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = double_precision_dtype(weights, default="i8") + return reduction( + a, + partial(cf_sum_of_weights_chunk, square=True), + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def var( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + ddof=None, + split_every=None, + ): + """Return variances of an array. + + Calculates the variance value of an array or the variance + values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. 
+ + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{ddof: number}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "f8" + return reduction( + a, + partial(cf_var_chunk, ddof=ddof), + partial(cf_var_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_var_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + +def check_input_dtype(a, allowed="fib"): + """Check that data has a data type allowed by a collapse method. + + The collapse method is assumed to be defined by the calling + function. + + :Parameters: + + a: `dask.array.Array` + The data. + + allowed: `str`, optional + The data type kinds allowed by the collapse + method. Defaults to ``'fib'``, meaning that only float, + integer and Boolean data types are allowed. + + :Returns: + + `None` + + """ + if a.dtype.kind not in allowed: + method = inspect.currentframe().f_back.f_code.co_name + raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") + + +def double_precision_dtype(a, default=None, bool_type="i"): + """Return the corresponding double precision data type of an array. + + :Parameters: + + a: `dask.array.Array` or `None` + The data. If `None` then the value of *default* is + returned. + + default: `str`, optional + If *a* is `None`, then return this data type. + + bool_type: `str`, optional + The corresponding double data type kind for Boolean + data. Defaults to ``'i'``, meaning ``'i8'`` is + returned. Set to ``'f'`` to return ``'f8'`` instead. + + :Returns: + + `str` + The double precision type. + + **Examples** + + >>> for d in (int, 'int32', float, 'float32', bool): + ... print(double_precision_dtype(np.array(1, dtype=d))) + ... + i8 + i8 + f8 + f8 + i8 + + >>> double_precision_dtype(np.array(1, dtype=bool), bool_type='f') + 'f8' + >>> double_precision_dtype(None, default="i8") + 'i8' + + """ + if a is None: + return default + + kind = a.dtype.kind + if kind == "b": + return bool_type + "8" + + if kind in "fi": + return kind + "8" + + raise TypeError(f"Can't collapse data with {a.dtype!r}") + + +def mask_small_sample_size(x, N, axis, mtol, original_shape): + """Mask elements where the sample size is below a threshold. + + .. versionadded:: TODODASK + + :Parameters: + + x: `numpy.ndarray` + The collapsed data. + + N: `numpy.ndarray` + The sample sizes of the collapsed values. + + axis: sequence of `int` + The axes being collapsed. + + mtol: number + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. + + original_shape: `tuple` + The shape of the original, uncollapsed data.
+ + :Returns: + + `numpy.ndarray` + Array *x* masked where *N* is sufficiently small. Note + that the input *x* might be modified in-place with the + contents of the output. + + """ + if not x.ndim: + # Make sure that we have a numpy array (e.g. as opposed to a + # numpy.float64) + x = np.asanyarray(x) + + if mtol < 1: + # Nmax = total number of elements, including missing values + Nmax = reduce(mul, [original_shape[i] for i in axis], 1) + x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) + + return x + + +def sum_weights_chunk(x, weights=None, square=False, N=None, **kwargs): + """Sum the weights. + + .. versionadded:: TODODASK + + :Parameters: + + x: `numpy.ndarray` + The data. + + weights: `numpy.ndarray`, optional + The weights associated with values of the data. Must have + the same shape as *x*. By default *weights* is `None`, + meaning that all non-missing elements of the data have a + weight of 1 and all missing elements have a weight of + 0. If given as an array then those weights that are + missing data, or that correspond to the missing elements + of the data, are assigned a weight of 0. + + square: `bool`, optional + If True calculate the sum of the squares of the weights. + + N: `numpy.ndarray`, optional + The sample size. If provided as an array and there are no + weights, then *N* is returned instead of calculating the + sum (of the squares) of weights. Ignored if *weights* is + not `None`. + + :Returns: + + `numpy.ndarray` + The sum of the weights, with data type "i8" or "f8". + + """ + if weights is None: + # All weights are 1, so the sum of the weights and the sum of + # the squares of the weights are both equal to the sample + # size. + if N is None: + N = cf_sample_size_chunk(x, **kwargs)["N"] + + return N + + dtype = double_precision_dtype(weights) + if square: + weights = np.multiply(weights, weights, dtype=dtype) + + if np.ma.is_masked(x): + weights = np.ma.masked_where(x.mask, weights) + + return chunk.sum(weights, dtype=dtype, **kwargs) + + +def combine_arrays( + pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs +): + """Worker function for Combine callables. + + Select arrays by dictionary key from a nested list of + dictionaries, concatenate the resulting nested list of arrays + along the specified axes, and then apply a function to the result + along those same axes. + + See `dask.array.reductions.mean_combine` for an example. + + .. versionadded:: TODODASK + + :Returns: + + `numpy.ndarray` + + """ + x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs + + if dtype: + kwargs["dtype"] = dtype + + x = _concatenate2(x, axes=axis) + return func(x, axis=axis, **kwargs) + + +def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.sum``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, key, chunk.sum, axis, dtype, computing_meta, **kwargs + ) + + +def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.max``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, key, chunk.max, axis, dtype, computing_meta, **kwargs + ) + + +def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.min``. + + ..
versionadded:: TODODASK + + """ + return combine_arrays( + pairs, key, chunk.min, axis, dtype, computing_meta, **kwargs + ) + + +def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``key="N", func=chunk.sum, + dtype="i8"``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, + "N", + chunk.sum, + axis, + dtype="i8", + computing_meta=computing_meta, + **kwargs, + ) + + +# -------------------------------------------------------------------- +# mean +# -------------------------------------------------------------------- +def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the mean. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). + * sum: The weighted sum of ``x``. + * weighted: True if weights have been set. + + """ + if computing_meta: + return x + + # N, sum + d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) + + d["V1"] = sum_weights_chunk(x, weights, N=d["N"], **kwargs) + d["weighted"] = weights is not None + + return d + + +def cf_mean_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the mean. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_mean_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + weighted = next(flatten(pairs))["weighted"] + d = {"weighted": weighted} + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d["sum"] + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) + else: + d["V1"] = d["N"] + + return d + + +def cf_mean_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the mean. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = divide(d["sum"], d["V1"], dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# maximum +# -------------------------------------------------------------------- +def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. 
+ + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of ``x``. + + """ + if computing_meta: + return x + + return { + "max": chunk.max(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + + +def cf_max_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + """Combination calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_max_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + if computing_meta: + return mx + + return { + "max": mx, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_max_agg( + pairs, + axis=None, + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the maximum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_max_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["max"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# mid-range +# -------------------------------------------------------------------- +def cf_mid_range_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the mid-range. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the mid-range + x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# minimum +# -------------------------------------------------------------------- +def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``.
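+ + **Examples** + + An illustrative direct call, with the kind of keyword + arguments that `dask.array.reduction` passes to each chunk: + + >>> import numpy as np + >>> cf_min_chunk(np.array([4, 2, 9]), axis=(0,), keepdims=True) + {'min': array([2]), 'N': array([3])}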
+ + """ + if computing_meta: + return x + + return { + "min": chunk.min(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + + +def cf_min_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + """Combination calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_min_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) + if computing_meta: + return mn + + return { + "min": mn, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_min_agg( + pairs, + axis=None, + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the minimum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_min_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# range +# -------------------------------------------------------------------- +def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the range. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``. + * max: The maximum of ``x`. + + """ + if computing_meta: + return x + + # N, max + d = cf_max_chunk(x, **kwargs) + + d["min"] = chunk.min(x, **kwargs) + return d + + +def cf_range_combine( + pairs, + axis=None, + dtype=None, + computing_meta=False, + **kwargs, +): + """Combination calculations for the range. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_range_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + if computing_meta: + return mx + + mn = min_arrays(pairs, "min", axis, None, **kwargs) + + return { + "max": mx, + "min": mn, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_range_agg( + pairs, + axis=None, + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the range. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. 
+ + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + # Pass dtype=None positionally so that computing_meta is not + # misinterpreted as the dtype by `cf_range_combine` + d = cf_range_combine(pairs, axis, None, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the range + x = d["max"] - d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# root mean square +# -------------------------------------------------------------------- +def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``x**2``. + + """ + if computing_meta: + return x + + return cf_mean_chunk( + np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs + ) + + +def cf_rms_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = np.sqrt(d["sum"] / d["V1"], dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sample size +# -------------------------------------------------------------------- +def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): + """Chunk calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + """ + if computing_meta: + return x + + if np.ma.isMA(x): + N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) + else: + if dtype: + kwargs["dtype"] = dtype + + N = numel(x, **kwargs) + + return {"N": N} + + +def cf_sample_size_combine( + pairs, + axis=None, + dtype="i8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_sample_size_chunk`.
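+ + **Examples** + + An illustrative direct call on a flat list of chunk results + (in practice *pairs* is supplied by `dask.array.reduction`): + + >>> import numpy as np + >>> pairs = [{"N": np.array([2])}, {"N": np.array([3])}] + >>> cf_sample_size_combine(pairs, axis=(0,), keepdims=True) + {'N': array([5])}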
+ + """ + if not isinstance(pairs, list): + pairs = [pairs] + + x = sum_arrays(pairs, "N", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + return {"N": x} + + +def cf_sample_size_agg( + pairs, + axis=None, + computing_meta=False, + dtype="i8", + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the sample size. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_sample_size_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["N"] + x = mask_small_sample_size(x, x, axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sum +# -------------------------------------------------------------------- +def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the sum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``x`` + + """ + if computing_meta: + return x + + if weights is not None: + x = np.multiply(x, weights, dtype=dtype) + + d = cf_sample_size_chunk(x, **kwargs) + d["sum"] = chunk.sum(x, dtype=dtype, **kwargs) + return d + + +def cf_sum_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the sum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_sum_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + return { + "sum": x, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_sum_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the sum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. 
+ + """ + d = cf_sum_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["sum"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sum of weights +# -------------------------------------------------------------------- +def cf_sum_of_weights_chunk( + x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs +): + """Chunk calculations for the sum of the weights. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + :Parameters: + + square: `bool`, optional + If True then calculate the sum of the squares of the + weights. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The sum of ``weights``, or the sum of + ``weights**2`` if *square* is True. + + """ + if computing_meta: + return x + + # N + d = cf_sample_size_chunk(x, **kwargs) + + d["sum"] = sum_weights_chunk( + x, weights=weights, square=square, N=d["N"], **kwargs + ) + + return d + + +# -------------------------------------------------------------------- +# variance +# -------------------------------------------------------------------- +def cf_var_chunk( + x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs +): + """Chunk calculations for the variance. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + See + https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics + for details. + + .. versionadded:: TODODASK + + :Parameters: + + ddof: number + The delta degrees of freedom. The number of degrees of + freedom used in the calculation is (N-*ddof*) where N + represents the number of non-missing elements. A value of + 1 applies Bessel's correction. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). + * V2: The sum of ``weights**2``, or `None` of not + required. + * sum: The weighted sum of ``x``. + * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is + the weighted biased (i.e. ``ddof=0``) variance of + ``x``, and ``mu`` is the weighted mean of ``x``. + * weighted: True if weights have been set. + * ddof: The delta degrees of freedom. + + """ + if computing_meta: + return x + + weighted = weights is not None + + # N, V1, sum + d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) + + wsum = d["sum"] + V1 = d["V1"] + + avg = divide(wsum, V1, dtype=dtype) + part = x - avg + part *= part + if weighted: + part = part * weights + + part = chunk.sum(part, dtype=dtype, **kwargs) + part = part + avg * wsum + + d["part"] = part + + if weighted and ddof == 1: + d["V2"] = sum_weights_chunk(x, weights, square=True, **kwargs) + else: + d["V2"] = None + + d["weighted"] = weighted + d["ddof"] = ddof + + return d + + +def cf_var_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the variance. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_var_chunk`. 
+ + """ + if not isinstance(pairs, list): + pairs = [pairs] + + d = next(flatten(pairs)) + weighted = d["weighted"] + ddof = d["ddof"] + d = {"weighted": weighted, "ddof": ddof} + + d["part"] = sum_arrays( + pairs, "part", axis, dtype, computing_meta, **kwargs + ) + if computing_meta: + return d["part"] + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, **kwargs) + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + d["V1"] = d["N"] + d["V2"] = None + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) + if ddof == 1: + d["V2"] = sum_arrays(pairs, "V2", axis, dtype, **kwargs) + + return d + + +def cf_var_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the variance. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. note:: Weights are interpreted as reliability weights, as + opposed to frequency weights. + + See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_var_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + ddof = d["ddof"] + V1 = d["V1"] + wsum = d["sum"] + var = d["part"] - wsum * wsum / V1 + + # Note: var is now the global value of V1 * sigma**2, where sigma + # is the global weighted biased (i.e. ddof=0) variance. + + if ddof is None: + raise ValueError(f"Must set ddof to a numeric value. Got: {ddof!r}") + + if not ddof: + # Weighted or unweighted variance with ddof=0 + f = 1 / V1 + elif not d["weighted"]: + # Unweighted variance with any non-zero value of ddof + f = 1 / (V1 - ddof) + elif ddof == 1: + # Weighted variance with ddof=1 + f = V1 / (V1 * V1 - d["V2"]) + else: + raise ValueError( + "Can only calculate a weighted variance with ddof=0 or ddof=1. 
" + f"Got: {ddof!r}" + ) + + # Now get the required global variance with the requested ddof + var = f * var + + var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) + return var diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py deleted file mode 100644 index 79017e4ace..0000000000 --- a/cf/data/collapse_functions.py +++ /dev/null @@ -1,1379 +0,0 @@ -from functools import partial as functools_partial - -from numpy import abs as numpy_abs -from numpy import amax as numpy_amax -from numpy import amin as numpy_amin -from numpy import array as numpy_array -from numpy import asanyarray as numpy_asanyarray -from numpy import average as numpy_average -from numpy import bool_ as numpy_bool_ -from numpy import empty as numpy_empty -from numpy import expand_dims as numpy_expand_dims -from numpy import integer as numpy_integer -from numpy import maximum as numpy_maximum -from numpy import minimum as numpy_minimum -from numpy import ndim as numpy_ndim -from numpy import sum as numpy_sum -from numpy.ma import array as numpy_ma_array -from numpy.ma import average as numpy_ma_average -from numpy.ma import expand_dims as numpy_ma_expand_dims -from numpy.ma import isMA as numpy_ma_isMA -from numpy.ma import nomask as numpy_ma_nomask -from numpy.ma import where as numpy_ma_where - -from ..functions import broadcast_array - - -def asanyarray(*args): - """Return every input array as an numpy ndarray, or a subclass of. - - :Parameters: - - args: sequence of array-like input objects - - :Returns: - - `tuple` - The input objects left as, else converted to, `numpy.ndarray` - - """ - out = [] - for x in args: - if x is not None and not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - out.append(numpy_asanyarray(x)) - else: - out.append(x) - # --- End: for - - return out - - -def psum(x, y): - """Add two arrays element-wise. - - If either or both of the arrays are masked then the output array is - masked only where both input arrays are masked. - - :Parameters: - - x: numpy array-like - *Might be updated in place*. - - y: numpy array-like - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - **Examples:** - - >>> c = psum(a, b) - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - x_mask = x.mask - x = x.filled(0) - x += y.filled(0) - x = numpy_ma_array(x, mask=x_mask & y.mask, copy=False) - else: - # Only x is masked - x = x.filled(0) - x += y - elif numpy_ma_isMA(y): - # Only y is masked - x += y.filled(0) - else: - # x and y are both unmasked - x += y - - return x - - -def pmax(x, y): - """The element-wise maximum of two arrays. - - :Parameters: - - x: array-like - May be updated in place and should not be used again. - - y: array-like - Will not be updated in place. 
- - :Returns: - - `numpy.ndarray` - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - z = numpy_maximum(x, y) - z = numpy_ma_where(x.mask & ~y.mask, y, z) - x = numpy_ma_where(y.mask & ~x.mask, x, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - else: - # Only x is masked - z = numpy_maximum(x, y) - x = numpy_ma_where(x.mask, y, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - elif numpy_ma_isMA(y): - # Only y is masked - z = numpy_maximum(x, y) - x = numpy_ma_where(y.mask, x, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - else: - # x and y are both unmasked - if not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, - # e.g. a numpy.float64) - x = numpy_asanyarray(x) - - numpy_maximum(x, y, out=x) - - return x - - -def pmin(x, y): - """The element-wise minimum of two arrays. - - :Parameters: - - x: `numpy.ndarray` - May be updated in place and should not be used again. - - y: `numpy.ndarray` - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - z = numpy_minimum(x, y) - z = numpy_ma_where(x.mask & ~y.mask, y, z) - x = numpy_ma_where(y.mask & ~x.mask, x, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - else: - # Only x is masked - z = numpy_minimum(x, y) - x = numpy_ma_where(x.mask, y, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - elif numpy_ma_isMA(y): - # Only y is masked - z = numpy_minimum(x, y) - x = numpy_ma_where(y.mask, x, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - else: - # x and y are both unmasked - if not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, - # e.g. a numpy.float64) - x = numpy_asanyarray(x) - - numpy_minimum(x, y, out=x) - - return x - - -def mask_where_too_few_values(Nmin, N, x): - """Mask elements of N and x where N is strictly less than Nmin. - - :Parameters: - - Nmin: `int` - - N: `numpy.ndarray` - - x: `numpy.ndarray` - - :Returns: - - (`numpy.ndarray`, `numpy.ndarray`) - A tuple containing *N* and *x*, both masked where *N* is - strictly less than *Nmin*. - - """ - if N.min() < Nmin: - mask = N < Nmin - N = numpy_ma_array(N, mask=mask, copy=False, shrink=False) - x = numpy_ma_array(x, mask=mask, copy=False, shrink=True) - - return asanyarray(N, x) - - -def double_precision(a): - """Convert the input array to double precision. - - :Parameters: - - a: `numpy.ndarray` - - :Returns: - - `numpy.ndarray` - - """ - char = a.dtype.char - if char == "f": - newtype = float - elif char == "i": - newtype = int - else: - return a - - if numpy_ma_isMA(a): - return a.astype(newtype) - else: - return a.astype(newtype, copy=False) - - -# -------------------------------------------------------------------- -# Maximum -# -------------------------------------------------------------------- -def max_f(a, axis=None, masked=False): - """Return the maximum of an array or maximum along a specified axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool` - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amax = numpy_amax(a, axis=axis) - - if not numpy_ndim(amax): - # Make sure that we have a numpy array (as opposed to, e.g. 
a - # numpy.float64) - amax = numpy_asanyarray(amax) - - return asanyarray(N, amax) - - -def max_fpartial(out, out1=None, group=False): - """Return the partial maximum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - N, amax = out - - if out1 is not None: - N1, amax1 = out1 - N = psum(N, N1) - amax = pmax(amax, amax1) - - return asanyarray(N, amax) - - -def max_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the maximum of an - array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - return mask_where_too_few_values(1, *out) - - -# -------------------------------------------------------------------- -# Minimum -# -------------------------------------------------------------------- -def min_f(a, axis=None, masked=False): - """Return the minimum of an array or minimum along a specified axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amin = numpy_amin(a, axis=axis) - - return asanyarray(N, amin) - - -def min_fpartial(out, out1=None, group=False): - """Return the partial minimum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - N, amin = out - - if out1 is not None: - N1, amin1 = out1 - N = psum(N, N1) - amin = pmin(amin, amin1) - - return asanyarray(N, amin) - - -def min_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the minimum of an - array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - return mask_where_too_few_values(1, *out) - - -# -------------------------------------------------------------------- -# maximum_absolute_value -# -------------------------------------------------------------------- -def max_abs_f(a, axis=None, masked=False): - """Return the maximum of the absolute array, or the maximum of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: bool - - :Returns: - - 2-tuple of numpy arrays - The sample sizes and the maxima of the absolute values. - - """ - return max_f(numpy_abs(a), axis=axis, masked=masked) - - -max_abs_fpartial = max_fpartial -max_abs_ffinalise = max_ffinalise - - -# -------------------------------------------------------------------- -# minimum_absolute_value -# -------------------------------------------------------------------- -def min_abs_f(a, axis=None, masked=False): - """Return the minimum of the absolute array, or the minimum of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input is - used. 
- - masked: `bool` - - :Returns: - - 2-tuple of `numpy.ndarray` - The sample sizes and the minima of the absolute values. - - """ - return min_f(numpy_abs(a), axis=axis, masked=masked) - - -min_abs_fpartial = min_fpartial -min_abs_ffinalise = min_ffinalise - - -# -------------------------------------------------------------------- -# mean -# -------------------------------------------------------------------- -def mean_f(a, axis=None, weights=None, masked=False): - """The weighted average along the specified axes. - - :Parameters: - - a: array-like - Input array. Not all missing data - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - :Returns: - - 3-`tuple` of `numpy.ndarray` - The sample size, average and sum of weights inside a 3-tuple. - - """ - a = double_precision(a) - - if masked: - average = numpy_ma_average - else: - average = numpy_average - - avg, sw = average(a, axis=axis, weights=weights, returned=True) - - if not numpy_ndim(avg): - avg = numpy_asanyarray(avg) - sw = numpy_asanyarray(sw) - - if weights is None: - N = sw.copy() - else: - (N,) = sample_size_f(a, axis=axis, masked=masked) - - return asanyarray(N, avg, sw) - - -def mean_fpartial(out, out1=None, group=False): - """Return the partial sample size, the partial sum and partial sum - of the weights. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - Either an output from a previous call to `mean_fpartial`; - or, if *out1* is `None`, an output from `mean_f`. - - out1: 3-`tuple` of `numpy.ndarray`, optional - An output from `mean_f`. - - :Returns: - - 3-`tuple` of `numpy.ndarray` - The sample size, average and sum of weights inside a 3-tuple. - - """ - N, avg, sw = out - - if out1 is None and not group: - # This is the first partition to be processed - - # Convert the partition average to a partition sum - avg *= sw - else: - # Combine this partition with existing parital combination - N1, avg1, sw1 = out1 - - # Convert the partition average to a partition sum - if not group: - avg1 *= sw1 - - N = psum(N, N1) - avg = psum(avg, avg1) # Now a partial sum - sw = psum(sw, sw1) - - return asanyarray(N, avg, sw) - - -def mean_ffinalise(out, sub_samples=None): - """Divide the weighted sum by the sum of weights. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - An output from `mean_fpartial`. - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the mean. - - """ - N, avg, sw = out - - if sub_samples: - avg /= sw - - return mask_where_too_few_values(1, N, avg) - - -# -------------------------------------------------------------------- -# mean_absolute_value -# -------------------------------------------------------------------- -def mean_abs_f(a, axis=None, weights=None, masked=False): - """Return the mean of the absolute array, or the means of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input is - used. - - masked: `bool` - - :Returns: - - 2-tuple of `numpy.ndarray` - The sample sizes and the means of the absolute values. 
- - """ - return mean_f(numpy_abs(a), axis=axis, weights=weights, masked=masked) - - -mean_abs_fpartial = mean_fpartial -mean_abs_ffinalise = mean_ffinalise - - -# -------------------------------------------------------------------- -# root_mean_square -# -------------------------------------------------------------------- -def root_mean_square_f(a, axis=None, weights=None, masked=False): - """The RMS along the specified axes. - - :Parameters: - - a: array-like - Input array. Not all missing data - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - :Returns: - - `tuple` - 3-tuple. - - """ - a = double_precision(a) - - return mean_f(a ** 2, axis=axis, weights=weights, masked=masked) - - -root_mean_square_fpartial = mean_fpartial - - -def root_mean_square_ffinalise(out, sub_samples=None): - """Divide the weighted sum by the sum of weights and take the square - root. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - An output from `root_mean_square_fpartial`. - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the RMS. - - """ - N, avg = mean_ffinalise(out, sub_samples=sub_samples) - - avg **= 0.5 - - return asanyarray(N, avg) - - -# -------------------------------------------------------------------- -# Mid-range: Average of maximum and minimum -# -------------------------------------------------------------------- -def mid_range_f(a, axis=None, masked=False): - """Return the minimum and maximum of an array or the minimum and - maximum along an axis. - - ``mid_range_f(a, axis=axis)`` is equivalent to ``(numpy.amin(a, - axis=axis), numpy.amax(a, axis=axis))`` - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - kwargs: ignored - - :Returns: - - 3-`tuple` - The sample size, minimum and maximum inside a 3-tuple. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amin = numpy_amin(a, axis=axis) - amax = numpy_amax(a, axis=axis) - - if not numpy_ndim(amin): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - amin = numpy_asanyarray(amin) - amax = numpy_asanyarray(amax) - - return asanyarray(N, amin, amax) - - -def mid_range_fpartial(out, out1=None, group=False): - """Return the partial minimum and partial maximum of an array. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - - out1: 3-`tuple` of `numpy.ndarray`, optional - - group: ignored - - :Returns: - - 3-`tuple` - The sample size, minimum and maximum inside a 3-tuple. - - """ - N, amin, amax = out - - if out1 is not None: - N1, amin1, amax1 = out1 - - N = psum(N, N1) - amin = pmin(amin, amin1) - amax = pmax(amax, amax1) - - return asanyarray(N, amin, amax) - - -def mid_range_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the array mid-range - value. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: ordered sequence - - amin: `numpy.ndarray` - - amax: `numpy.ndarray` - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the mid-range value. 
- - """ - N, amin, amax = out - - amax = double_precision(amax) - - # Cast bool, unsigned int, and int to float64 - if issubclass(amax.dtype.type, (numpy_integer, numpy_bool_)): - amax = amax.astype(float) - - # Calculate the mid-range value, storing it in the amax variable - amax += amin - amax *= 0.5 - - return mask_where_too_few_values(1, N, amax) - - -# --------------------------------------------------------------------- -# Range: Absolute difference between maximum and minimum -# --------------------------------------------------------------------- -range_f = mid_range_f -range_fpartial = mid_range_fpartial - - -def range_ffinalise(out, sub_samples=None): - """Absolute difference between maximum and minimum. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: ordered sequence - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the range. - - """ - N, amin, amax = out - - # Calculate the range value, storing it in the amax variable - amax = double_precision(amax) - amax -= amin - - return mask_where_too_few_values(1, N, amax) - - -# --------------------------------------------------------------------- -# Sample size -# --------------------------------------------------------------------- -def sample_size_f(a, axis=None, masked=False): - """Return the sample size. - - :Parameters: - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - `numpy.ndarray` - - """ - if masked: - N = numpy_sum(~a.mask, axis=axis, dtype=float) - if not numpy_ndim(N): - N = numpy_asanyarray(N) - else: - if axis is None: - N = numpy_array(a.size, dtype=float) - else: - shape = a.shape - N = numpy_empty(shape[:axis] + shape[axis + 1 :], dtype=float) - N[...] = shape[axis] - # --- End: if - - return asanyarray(N) - - -def sample_size_fpartial(out, out1=None, group=False): - """Return the partial sample size. - - :Parameters: - - out: ordered sequence of one numpy array - - :Returns: - - `numpy.ndarray` - - """ - (N,) = out - if out1 is not None: - (N1,) = out1 - N = psum(N, N1) - - return asanyarray(N) - - -def sample_size_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to give the sample size. - - :Parameters: - - out: ordered sequence of one numpy array - - sub_samples: optional - Ignored. - - :Returns: - - `tuple` - A 2-tuple containing *N* twice. - - """ - (N,) = out - return asanyarray(N, N) - - -# --------------------------------------------------------------------- -# sum -# --------------------------------------------------------------------- -def sum_f(a, axis=None, weights=None, masked=False): - """Return the sum of an array or the sum along an axis. - - ``sum_f(a, axis=axis)`` is equivalent to ``(numpy.sum(a, - axis=axis),)`` - - :Parameters: - - a: numpy array-like - Input array - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. 
- - """ - a = double_precision(a) - - (N,) = sample_size_f(a, axis=axis, masked=masked) - - if weights is not None: - # A weights array has been provided - weights = double_precision(weights) - - if weights.ndim < a.ndim: - weights = broadcast_array(weights, a.shape) - - a = a * weights - - asum = a.sum(axis=axis) - - if not numpy_ndim(asum): - asum = numpy_asanyarray(asum) - - return asanyarray(N, asum) - - -def sum_fpartial(out, out1=None, group=False): - """Return the partial sum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - group: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - N, asum = out - - if out1 is not None: - N1, asum1 = out1 - N = psum(N, N1) - asum = psum(asum, asum1) - - return asanyarray(N, asum) - - -def sum_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the sum of an array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - return mask_where_too_few_values(1, *out) - - -# --------------------------------------------------------------------- -# sum_of_squares -# --------------------------------------------------------------------- -def sum_of_squares_f(a, axis=None, weights=None, masked=False): - """Return the sum of the square of an array or the sum of squares - along an axis. - - :Parameters: - - a: numpy array-like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - `tuple` - 2-tuple - - """ - a = double_precision(a) - return sum_f(a ** 2, axis=axis, weights=weights, masked=masked) - - -sum_of_squares_fpartial = sum_fpartial -sum_of_squares_ffinalise = sum_ffinalise - - -# --------------------------------------------------------------------- -# Sum of weights -# --------------------------------------------------------------------- -def sw_f( - a, axis=None, masked=False, weights=None, N=None, sum_of_squares=False -): - """Return the sum of weights for an array. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool`, optional - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - N: `numpy.ndarray` - Sample size - - sum_of_squares: delta degrees of freedom - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum of weights. - - """ - if N is None: - (N,) = sample_size_f(a, axis=axis, masked=masked) - - if weights is not None: - # A weights array has been provided - weights = double_precision(weights) - - if weights.ndim < a.ndim: - weights = broadcast_array(weights, a.shape) - - if masked: - weights = numpy_ma_array(weights, mask=a.mask, copy=False) - - if sum_of_squares: - weights = weights * weights - - sw = weights.sum(axis=axis) - - if not numpy_ndim(sw): - sw = numpy_asanyarray(sw) - else: - # The sum of weights is equal to the sample size (i.e. 
an - # unweighted sample) - sw = N.copy() - - return asanyarray(N, sw) - - -sw_fpartial = sum_fpartial -sw_ffinalise = sum_ffinalise - -# --------------------------------------------------------------------- -# Sum of squares of weights -# --------------------------------------------------------------------- -sw2_f = functools_partial(sw_f, sum_of_squares=True) -sw2_fpartial = sum_fpartial -sw2_ffinalise = sum_ffinalise - - -# --------------------------------------------------------------------- -# Variance -# --------------------------------------------------------------------- -def var_f(a, axis=None, weights=None, masked=False, ddof=0): - """Return a tuple containing metrics relating to the array variance. - - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Sample size - - var Sample variance (ddof=0) - - avg Weighted mean - - V1 Sum of weights - - V2 Sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the sample is weighted - ======== ============================================================ - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - ddof: delta degrees of freedom, optional - - :Returns: - - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. - - """ - # Make sure that a is double precision - a = double_precision(a) - - weighted = weights is not None - - # ---------------------------------------------------------------- - # Methods: - # - # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - # ---------------------------------------------------------------- - - # Calculate N = number of data points - # Calculate avg = mean of data - # Calculate V1 = sum of weights - N, avg, V1 = mean_f(a, weights=weights, axis=axis, masked=masked) - - # Calculate V2 = sum of squares of weights - if weighted and ddof == 1: - N, V2 = sw2_f(a, axis=axis, masked=masked, weights=weights, N=N) - else: - V2 = None - - if axis is not None and avg.size > 1: - # We collapsed over a single axis and the array has 2 or more - # axes, so add an extra size 1 axis to the mean so that - # broadcasting works when we calculate the variance. - reshape_avg = True - if masked: - expand_dims = numpy_ma_expand_dims - else: - expand_dims = numpy_expand_dims - - avg = expand_dims(avg, axis) - else: - reshape_avg = False - - var = a - avg - var *= var - - if masked: - average = numpy_ma_average - else: - average = numpy_average - - var = average(var, axis=axis, weights=weights) - - if reshape_avg: - shape = avg.shape - avg = avg.reshape(shape[:axis] + shape[axis + 1 :]) - - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) - - return (N, var, avg, V1, V2, ddof, weighted) - - -def var_fpartial(out, out1=None, group=False): - """Return a tuple of partial metrics relating to the array variance. 
- - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Partial sample size - - var Partial sum of V1*(variance + mean^2) - - avg Unweighted partial sum - - V1 Partial sum of weights - - V2 Partial sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the population is weighted - ======== ============================================================ - - For further information, see: - https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - - :Parameters: - - out: 7-`tuple` - - out1: 7-`tuple`, optional - - :Returns: - - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. - - """ - (N, var, avg, V1, V2, ddof, weighted) = out - - if out1 is None and not group: - # ------------------------------------------------------------ - # var = V1(var+avg**2) - # avg = V1*avg = unweighted partial sum - # ------------------------------------------------------------ - var += avg * avg - var *= V1 - avg *= V1 - else: - # ------------------------------------------------------------ - # var = var + V1b(varb+avgb**2) - # avg = avg + V1b*avgb - # V1 = V1 + V1b - # V2 = V2 + V2b - # ------------------------------------------------------------ - (Nb, varb, avgb, V1b, V2b, ddof, weighted) = out1 - - N = psum(N, Nb) - - if not group: - varb += avgb * avgb - varb *= V1b - avgb *= V1b - - var = psum(var, varb) - avg = psum(avg, avgb) - V1 = psum(V1, V1b) - - if weighted and ddof == 1: - V2 = psum(V2, V2b) - # --- End: if - - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) - - return (N, var, avg, V1, V2, ddof, weighted) - - -def var_ffinalise(out, sub_samples=None): - """Calculate the variance of the array and return it with the sample - size. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 7-`tuple` - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the variance. 
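# --------------------------------------------------------------------
# Editorial aside, not part of this diff: a numpy check of the pooled-
# variance identity that var_fpartial accumulates and var_ffinalise
# (below) resolves: each partition contributes V1*(var + avg**2), and
# the global biased variance is SUM(V1_i*(var_i + avg_i**2))/V1 - m**2.
# The data values are made up for the example.
import numpy as np

x = np.array([1.0, 2.0, 4.0, 7.0, 11.0, 16.0])
parts = [x[:2], x[2:5], x[5:]]

V1 = sum(p.size for p in parts)  # unweighted, so V1 == N
pooled = sum(p.size * (p.var() + p.mean() ** 2) for p in parts)
m = sum(p.sum() for p in parts) / V1

biased = pooled / V1 - m**2
assert np.isclose(biased, x.var())  # ddof=0

# Unweighted ddof correction, as applied in var_ffinalise:
ddof = 1
assert np.isclose(biased * V1 / (V1 - ddof), x.var(ddof=1))
# --------------------------------------------------------------------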
- - """ - (N, var, avg, V1, V2, ddof, weighted) = out - - N, var = mask_where_too_few_values(max(2, ddof + 1), N, var) - N, V1 = mask_where_too_few_values(max(2, ddof + 1), N, V1) - if V2 is not None: - N, V2 = mask_where_too_few_values(max(2, ddof + 1), N, V2) - - if sub_samples: - # ---------------------------------------------------------------- - # The global biased variance = {[SUM(pV1(pv+pm**2)]/V1} - m**2 - # - # where pV1 = partial sum of weights - # pv = partial biased variance - # pm = partial mean - # V1 = global sum of weights - # m = global mean - # - # Currently: var = SUM(pV1(pv+pm**2) - # avg = V1*m - # - # https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - # - # For the general case of M non-overlapping data sets, X_{1} - # through X_{M}, and the aggregate data set X=\bigcup_{i}X_{i} - # we have the unweighted mean and variance is: - # - # \mu_{X}={\frac{1}{\sum_{i}{N_{X_{i}}}}}\left(\sum_{i}{N_{X_{i}}\mu_{X_{i}}}\right) - # - # var_{X}={{\frac{1}{\sum_{i}{N_{X_{i}}-ddof}}}\left(\sum_{i}{\left[(N_{X_{i}}-1)\sigma_{X_{i}}^{2}+N_{X_{i}}\mu_{X_{i}}^{2}\right]}-\left[\sum_{i}{N_{X_{i}}}\right]\mu_{X}^{2}\right)} - # - # ---------------------------------------------------------------- - avg /= V1 - avg *= avg - var /= V1 - var -= avg - - # ---------------------------------------------------------------- - # var is now the biased global variance - # ---------------------------------------------------------------- - if not weighted: - if ddof: - # The unweighted variance with N-ddof degrees of freedom is - # [V1/(V1-ddof)]*var. In this case V1 equals the sample size, - # N. ddof=1 provides an unbiased estimator of the variance of - # a hypothetical infinite population. - V1 /= V1 - ddof - var *= V1 - elif ddof == 1: - # Calculate the weighted unbiased variance. The unbiased - # variance weighted with _reliability_ weights is - # [V1**2/(V1**2-V2)]*var. - # - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - V1 **= 2 - var *= V1 - V1 -= V2 - var /= V1 - elif ddof: - raise ValueError( - "Can only calculate a weighted variance with a delta degrees " - "of freedom (ddof) of 0 or 1: Got {}".format(ddof) - ) - - return asanyarray(N, var) - - -# --------------------------------------------------------------------- -# standard_deviation -# --------------------------------------------------------------------- -sd_f = var_f -sd_fpartial = var_fpartial - - -def sd_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the standard - deviation. - - :Parameters: - - out: `tuple` - A 2-tuple - - sub_samples: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the standard deviation. - - """ - N, sd = var_ffinalise(out, sub_samples) - - sd **= 0.5 - - return asanyarray(N, sd) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 6429fd2afe..73dc7aed8e 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -15,6 +15,8 @@ from ..functions import atol as cf_atol from ..functions import rtol as cf_rtol +# from dask.utils import deepmap # Apply function inside nested lists + def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): """An effective dask.array.ma.allclose method. @@ -260,16 +262,22 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): original array *a*. 
mtol: number, optional - Set an upper limit of the amount input data values which - are allowed to be missing data when contributing to - individual output percentile values. It is defined as a - fraction (between 0 and 1 inclusive) of the contributing - input data values. The default is 1, meaning that a - missing datum in the output array only occurs when all of - its contributing input array elements are missing data. A - value of 0 means that a missing datum in the output array - occurs whenever any of its contributing input array - elements are missing data. + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. :Returns: @@ -297,7 +305,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): a, axis=axis, keepdims=keepdims ) if n_missing.any(): - mask = np.where(n_missing >= mtol * full_size, True, False) + mask = np.where(n_missing > mtol * full_size, True, False) if q.ndim: mask = np.expand_dims(mask, 0) diff --git a/cf/data/data.py b/cf/data/data.py index 0c51c4e748..254581cd9c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -17,7 +17,6 @@ from dask.base import is_dask_collection, tokenize from dask.core import flatten from dask.highlevelgraph import HighLevelGraph -from numpy.testing import suppress_warnings as numpy_testing_suppress_warnings from ..cfdatetime import dt as cf_dt from ..constants import masked as cf_masked @@ -35,7 +34,6 @@ abspath, ) from ..functions import atol as cf_atol -from ..functions import broadcast_array from ..functions import chunksize as cf_chunksize from ..functions import default_netCDF_fillvals from ..functions import fm_threshold as cf_fm_threshold @@ -46,53 +44,7 @@ from ..mixin_container import Container from ..units import Units from . 
import NetCDFArray, UMArray -from .collapse_functions import ( # max_f,; max_ffinalise,; max_fpartial, - max_abs_f, - max_abs_ffinalise, - max_abs_fpartial, - mean_abs_f, - mean_abs_ffinalise, - mean_abs_fpartial, - mean_f, - mean_ffinalise, - mean_fpartial, - mid_range_f, - mid_range_ffinalise, - mid_range_fpartial, - min_abs_f, - min_abs_ffinalise, - min_abs_fpartial, - min_f, - min_ffinalise, - min_fpartial, - range_f, - range_ffinalise, - range_fpartial, - root_mean_square_f, - root_mean_square_ffinalise, - root_mean_square_fpartial, - sample_size_f, - sample_size_ffinalise, - sample_size_fpartial, - sd_f, - sd_ffinalise, - sd_fpartial, - sum_f, - sum_ffinalise, - sum_fpartial, - sum_of_squares_f, - sum_of_squares_ffinalise, - sum_of_squares_fpartial, - sw2_f, - sw2_ffinalise, - sw2_fpartial, - sw_f, - sw_ffinalise, - sw_fpartial, - var_f, - var_ffinalise, - var_fpartial, -) +from .collapse import Collapse from .creation import ( compressed_to_dask, convert_to_builtin_type, @@ -1537,62 +1489,6 @@ def _del_dask(self, default=ValueError(), delete_source=True): return out - def _map_blocks( - self, func, delete_source=True, reset_mask_hardness=True, **kwargs - ): - """Apply a function to the data in-place. - - .. warning:: **This method **does not reset the mask - hardness**. It may be necessary for a call to - `_map_blocks` to be followed by a call to - `_reset_mask_hardness` (or equivalent). - - .. versionadded:: TODODASK - - :Parameters: - - func: - The function to be applied to the data, via - `dask.array.map_blocks`, to each chunk of the dask - array. - - delete_source: `bool`, optional - If False then do not delete a source array, if one - exists, after applying the function. By default a - source array is deleted. - - reset_mask_hardness: `bool`, optional - If False then do not reset the mask hardness after - applying the function. By default the mask hardness is - re-applied, even if the mask hardness has not changed. - - kwargs: optional - Keyword arguments passed to the - `dask.array.map_blocks` method. - - :Returns: - - `dask.array.Array` - The updated dask array. - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> dx = d._map_blocks(lambda x: x / 2) - >>> print(d.array) - [0.5 1. 1.5] - - """ - dx = self.get_dask(copy=False) - dx = dx.map_blocks(func, **kwargs) - self._set_dask( - dx, - delete_source=delete_source, - reset_mask_hardness=reset_mask_hardness, - ) - - return dx - def _reset_mask_hardness(self): """Re-apply the mask hardness to the dask array. @@ -2024,9 +1920,47 @@ def median( squeeze=False, mtol=1, inplace=False, - _preserve_partitions=False, ): - """Compute the median of the values.""" + """Calculate median values. + + Calculates the median value or the median values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean_of_upper_decile`, `percentile` + + :Parameters: + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. 
+
+        **Examples**
+
+        >>> a = np.ma.arange(12).reshape(4, 3)
+        >>> d = cf.Data(a, 'K')
+        >>> d[1, 1] = np.ma.masked
+        >>> print(d.array)
+        [[0 1 2]
+         [3 -- 5]
+         [6 7 8]
+         [9 10 11]]
+        >>> d.median()
+        <CF Data(1, 1): [[6.0]] K>
+
+        """
         return self.percentile(
             50,
             axes=axes,
@@ -2039,19 +1973,68 @@ def median(
     def mean_of_upper_decile(
         self,
         axes=None,
-        include_decile=True,
-        squeeze=False,
         weights=None,
+        squeeze=False,
         mtol=1,
+        include_decile=True,
+        split_every=None,
         inplace=False,
-        _preserve_partitions=False,
     ):
-        """Compute the mean the of upper decile.
+        """Calculate means of the upper deciles.
+
+        Calculates the mean of the upper decile or the means of the
+        upper deciles along axes.
+
+        See
+        https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods
+        for mathematical definitions.
+
+        .. seealso:: `mean`, `median`, `percentile`
+
+        :Parameters:
+
+            {{collapse axes: (sequence of) `int`, optional}}
+
+            {{weights: data_like, `dict`, or `None`, optional}}
+
+                TODODASK - note that weights only applies to the
+                           calculation of the mean, not the upper
+                           decile.
+
+            {{collapse squeeze: `bool`, optional}}
+
+            {{mtol: number, optional}}
+
+                TODODASK - note that mtol only applies to the
+                           calculation of the upper decile, not the
+                           mean.
+
+            include_decile: `bool`, optional
+                TODODASK
+
+            {{split_every: `int` or `dict`, optional}}
+
+                .. versionadded:: TODODASK
+
+            {{inplace: `bool`, optional}}
+
+        :Returns:
+
+            `Data` or `None`
+                The collapsed data, or `None` if the operation was
+                in-place.
+
+        **Examples**
 
-        Specifically, calculate the mean of the upper group of data
-        values defined by the upper tenth of their distribution.
+        TODODASK
 
         """
+        # TODODASK: Some updates off the back of daskifying collapse
+        #           have been done, but still needs looking at. A unit
+        #           test has also been written, but not run. Needs
+        #           __lt__ and __le__.
+
         d = _inplace_enabled_define_and_cleanup(self)

         p90 = d.percentile(
@@ -2060,10 +2043,9 @@ def mean_of_upper_decile(
             squeeze=False,
             mtol=mtol,
             inplace=False,
-            _preserve_partitions=_preserve_partitions,
         )

-        with numpy_testing_suppress_warnings() as sup:
+        with np.testing.suppress_warnings() as sup:
             sup.filter(
                 RuntimeWarning, message=".*invalid value encountered in less.*"
             )
@@ -2071,7 +2053,6 @@ def mean_of_upper_decile(
                 mask = d < p90
             else:
                 mask = d <= p90
-        # --- End: with

         if mtol < 1:
             mask.filled(False, inplace=True)
@@ -2080,10 +2061,11 @@ def mean_of_upper_decile(

         d.mean(
             axes=axes,
-            squeeze=squeeze,
             weights=weights,
+            squeeze=squeeze,
+            mtol=1,
+            split_every=split_every,
             inplace=True,
-            _preserve_partitions=_preserve_partitions,
         )

         return d
@@ -2195,23 +2177,11 @@ def percentile(
                 is guaranteed to broadcast correctly against the
                 original data.

-            mtol: number, optional
-                Set an upper limit of the amount input data values
-                which are allowed to be missing data when contributing
-                to individual output percentile values. It is defined
-                as a fraction (between 0 and 1 inclusive) of the
-                contributing input data values. The default is 1,
-                meaning that a missing datum in the output array only
-                occurs when all of its contributing input array
-                elements are missing data. A value of 0 means that a
-                missing datum in the output array occurs whenever any
-                of its contributing input array elements are missing
-                data.
+            {{mtol: number, optional}}

-                *Parameter example:*
-                  To ensure that an output array element is a missing
-                  value if more than 25% of its input array elements
-                  are missing data: ``mtol=0.25``.
+            {{split_every: `int` or `dict`, optional}}
+
+                .. 
versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -2250,7 +2220,7 @@ def percentile( [[-- -- -- --] [-- -- -- --] [-- 9 10 11]] - >>> e.sd() + >>> e.std() Find the mean of the values above the 45th percentile along the @@ -3457,7 +3427,9 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - d._map_blocks(cf_rt2dt, units=units, dtype=object) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -3512,7 +3484,9 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - d._map_blocks(cf_dt2rt, units=units, dtype=float) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -5076,917 +5050,150 @@ def __pos__(self): """ return self._unary_operation("__pos__") - @_deprecated_kwarg_check("i") - @_inplace_enabled(default=False) - def _collapse( - self, - func, - fpartial, - ffinalise, - axes=None, - squeeze=False, - weights=None, - mtol=1, - units=None, - inplace=False, - i=False, - _preserve_partitions=False, - **kwargs, - ): - """Collapse the data. + # ---------------------------------------------------------------- + # Private attributes + # ---------------------------------------------------------------- + @property + def _Units(self): + """Storage for the units. - :Parameters: + The units are stored in a `Units` object, and reflect the + units of the (yet to be computed) elements of the underlying + data. - func: function + .. warning:: Assigning to `_Units` does *not* trigger a units + conversion of the underlying data + values. Therefore assigning to `_Units` should + only be done in cases when it is known that the + intrinsic units represented by the data values + are inconsistent with the existing value of + `_Units`. Before assigning to `_Units`, first + consider if assigning to `Units`, or calling the + `override_units` or `override_calendar` method is + a more appropriate course of action, and use one + of those if possible. - fpartial: function + """ + return self._custom["_Units"] - ffinalise: function + @_Units.setter + def _Units(self, value): + self._custom["_Units"] = value - axes: (sequence of) `int`, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + @_Units.deleter + def _Units(self): + self._custom["_Units"] = _units_None - squeeze: `bool`, optional - If False then the axes which are collapsed are left in the - result as axes with size 1. In this case the result will - broadcast correctly against the original array. By default - collapsed axes are removed. + @property + def _cyclic(self): + """Storage for axis cyclicity. - weights: *optional* + Contains a `set` that identifies which axes are cyclic (and + therefore allow cyclic slicing). The set contains a subset of + the axis identifiers defined by the `_axes` attribute. - {{inplace: `bool`, optional}} + .. warning:: Never change the value of the `_cyclic` attribute + in-place. - {{i: deprecated at version 3.0.0}} + .. note:: When an axis identifier is removed from the `_axes` + attribute then it is automatically also removed from + the `_cyclic` attribute. - _preserve_partitions: `bool`, optional - If True then preserve the shape of the partition matrix of - the input data, at the expense of a much slower - execution. 
By default, the partition matrix may be reduced - (using `varray`) to considerably speed things up. + """ + return self._custom["_cyclic"] - kwargs: *optional* + @_cyclic.setter + def _cyclic(self, value): + self._custom["_cyclic"] = value - :Returns: + @_cyclic.deleter + def _cyclic(self): + self._custom["_cyclic"] = _empty_set - `Data` or `None` + @property + @daskified(_DASKIFIED_VERBOSE) + def _hardmask(self): + """Storage for the mask hardness. - """ - d = _inplace_enabled_define_and_cleanup(self) - ndim = d._ndim - self_axes = d._axes - self_shape = d._shape + Contains a `bool`, where `True` denotes a hard mask and + `False` denotes a soft mask. - original_self_axes = self_axes[:] + .. warning:: Assigning to `_hardmask` does *not* trigger a + hardening or softening of the mask of the + underlying data values. Therefore assigning to + `_hardmask` should only be done in cases when it + is known that the intrinsic mask hardness of the + data values is inconsistent with the + existing value of `_hardmask`. Before assigning + to `_hardmask`, first consider if assigning to + `hardmask`, or calling the `harden_mask` or + `soften_mask` method is a more appropriate course + of action, and use one of those if possible. - if axes is None: - # Collapse all axes - axes = list(range(ndim)) - n_collapse_axes = ndim - n_non_collapse_axes = 0 - Nmax = d._size - elif not axes and axes != 0: - # Collapse no axes - if inplace: - d = None - return d - else: - # Collapse some (maybe all) axes - axes = sorted(d._parse_axes(axes)) # , '_collapse')) - n_collapse_axes = len(axes) - n_non_collapse_axes = ndim - n_collapse_axes - Nmax = 1 - for i in axes: - Nmax *= self_shape[i] - # --- End: if + See `hardmask` for details. - # ------------------------------------------------------------- - # Parse the weights. - # - # * Change the keys from dimension names to the integer - # positions of the dimensions. - # - # * Make sure all non-null weights are Data objects. - # ------------------------------------------------------------ - if weights is not None: - if not isinstance(weights, dict): - # If the shape of the weights is not the same as the - # shape of the data array then the weights are assumed - # to span the collapse axes in the order in which they - # are given - if np.shape(weights) == self_shape: - weights = {tuple(self_axes): weights} - else: - weights = {tuple([self_axes[i] for i in axes]): weights} + """ + return self._custom["_hardmask"] - else: - weights = weights.copy() - weights_axes = set() - for key, value in tuple(weights.items()): - del weights[key] - key = d._parse_axes(key) - if weights_axes.intersection(key): - raise ValueError("Duplicate weights axis") - - weights_axes.update(key) - weights[tuple([self_axes[i] for i in key])] = value - # --- End: for + @_hardmask.setter + def _hardmask(self, value): + self._custom["_hardmask"] = value - if not weights_axes.intersection(axes): - # Ignore all of the weights if none of them span - # any collapse axes - weights = {} - # --- End: if + @property + @daskified(_DASKIFIED_VERBOSE) + def _axes(self): + """Storage for the axis identifiers. - for key, weight in tuple(weights.items()): - if weight is None or np.size(weight) == 1: - # Ignore undefined weights and size 1 weights - del weights[key] - continue + Contains a `tuple` of identifiers, one for each array axis. - weight_ndim = np.ndim(weight) - if weight_ndim != len(key): - raise ValueError( - "Can't collapse: Incorrect number of weights " - "axes (%d != %d)" % (weight.ndim, len(key)) - ) + .. 
note:: When the axis identifiers are reset, then any axis + identifier named by the `_cyclic` attribute which is + not in the new `_axes` set is automatically removed + from the `_cyclic` attribute. - if weight_ndim > ndim: - raise ValueError( - "Can't collapse: Incorrect number of weights " - "axes (%d > %d)" % (weight.ndim, ndim) - ) + """ + return self._custom["_axes"] - for n, axis in zip(np.shape(weight), key): - if n != self_shape[self_axes.index(axis)]: - raise ValueError( - "Can't collapse: Incorrect weights " - "shape {!r}".format(np.shape(weight)) - ) - # --- End: for + @_axes.setter + def _axes(self, value): + self._custom["_axes"] = tuple(value) - # Convert weight to a data object, if necessary. - weight = type(self).asdata(weight) + # Remove cyclic axes that are not in the new axes + cyclic = self._cyclic + if cyclic: + # Never change the value of the _cyclic attribute in-place + self._cyclic = cyclic.intersection(value) - if weight.dtype.char in ("S", "U"): - # Ignore string-valued weights - del weights[key] - continue + # ---------------------------------------------------------------- + # Dask attributes + # ---------------------------------------------------------------- + @property + def chunks(self): + """TODODASK.""" + return self._get_dask().chunks - weights[key] = weight - # --- End: for - # --- End: if + @property + def force_compute(self): + """TODODASK See also config settings.""" + return self._custom.get("force_compute", False) - if axes != list(range(n_non_collapse_axes, ndim)): - transpose_iaxes = [i for i in range(ndim) if i not in axes] + axes - d.transpose(transpose_iaxes, inplace=True) + @force_compute.setter + def force_compute(self, value): + self._custom["force_compute"] = bool(value) - if weights: - # Optimize when weights span only non-partitioned axes - # (do this before permuting the order of the weight - # axes to be consistent with the order of the data - # axes) - weights = d._collapse_optimize_weights(weights) + # ---------------------------------------------------------------- + # Attributes + # ---------------------------------------------------------------- + @property + @daskified(_DASKIFIED_VERBOSE) + def Units(self): + """The `cf.Units` object containing the units of the data array. - # Permute the order of the weight axes to be - # consistent with the order of the data axes - self_axes = d._axes - for key, w in tuple(weights.items()): - key1 = tuple([axis for axis in self_axes if axis in key]) + Can be set to any units equivalent to the existing units. - if key1 != key: - w = w.transpose([key.index(axis) for axis in key1]) + .. 
seealso `override_units`, `override_calendar` - del weights[key] - ikey = tuple([self_axes.index(axis) for axis in key1]) - - weights[ikey] = w - - # Add the weights to kwargs - kwargs["weights"] = weights - - # If the input data array 'fits' in one chunk of memory, then - # make sure that it has only one partition - if ( - not _preserve_partitions - and d._pmndim - and d.fits_in_one_chunk_in_memory(d.dtype.itemsize) - ): - d.varray - - # ------------------------------------------------------------- - # Initialise the output data array - # ------------------------------------------------------------- - new = d[(Ellipsis,) + (0,) * n_collapse_axes] - - # new._auxiliary_mask = None - for partition in new.partitions.matrix.flat: - # Do this so as not to upset the ref count on the - # parittion's of d - del partition.subarray - - # d.to_memory() - - # save = not new.fits_in_memory(new.dtype.itemsize) - keep_in_memory = new.fits_in_memory(new.dtype.itemsize) - - datatype = d.dtype - - if units is None: - new_units = new.Units - else: - new_units = units - - p_axes = new._axes[:n_non_collapse_axes] - p_units = new_units - - c_slice = (slice(None),) * n_collapse_axes - - config = new.partition_configuration( - readonly=False, auxiliary_mask=None, extra_memory=False # DCH ??x - ) - - processed_partitions = [] - for pmindex, partition in np.ndenumerate(new.partitions.matrix): - if partition._process_partition: - # Only process the partition if it is flagged - partition.open(config) - - # Save the position of the partition in the partition - # matrix - partition._pmindex = pmindex - - partition.axes = p_axes - partition.flip = [] - partition.part = [] - partition.Units = p_units - - if squeeze: - # Note: parentheses for line continuation (not a tuple): - partition.location = partition.location[ - :n_non_collapse_axes - ] - partition.shape = partition.shape[:n_non_collapse_axes] - - indices = partition.indices[:n_non_collapse_axes] + c_slice - - partition.subarray = d._collapse_subspace( - func, - fpartial, - ffinalise, - indices, - n_non_collapse_axes, - n_collapse_axes, - Nmax, - mtol, - _preserve_partitions=_preserve_partitions, - _parallelise_collapse_subspace=False, - **kwargs, - ) - - partition.close(keep_in_memory=keep_in_memory) - - # Add each partition to a list of processed partitions - processed_partitions.append(partition) - # --- End: if - # --- End: for - - # processed_partitions contains a list of all the partitions - # that have been processed on this rank. In the serial case - # this is all of them and this line of code has no - # effect. Otherwise the processed partitions from each rank - # are distributed to every rank and processed_partitions now - # contains all the processed partitions from every rank. - processed_partitions = self._share_partitions( - processed_partitions, False - ) - - # Put the processed partitions back in the partition matrix - # according to each partitions _pmindex attribute set above. 
- pm = new.partitions.matrix - for partition in processed_partitions: - pm[partition._pmindex] = partition - - p_datatype = partition.subarray.dtype - if datatype != p_datatype: - datatype = np.result_type(p_datatype, datatype) - # --- End: for - - new._Units = new_units - new.dtype = datatype - - if squeeze: - new._axes = p_axes - new._ndim = ndim - n_collapse_axes - new._shape = new._shape[: new._ndim] - else: - new_axes = new._axes - if new_axes != original_self_axes: - iaxes = [new_axes.index(axis) for axis in original_self_axes] - new.transpose(iaxes, inplace=True) - # --- End: if - - # ------------------------------------------------------------ - # Update d in place and return - # ------------------------------------------------------------ - d.__dict__ = new.__dict__ - - return d - - def _collapse_subspace( - self, - func, - fpartial, - ffinalise, - indices, - n_non_collapse_axes, - n_collapse_axes, - Nmax, - mtol, - weights=None, - _preserve_partitions=False, - _parallelise_collapse_subspace=True, - **kwargs, - ): - """Collapse a subspace of a data array. - - If set, *weights* and *kwargs* are passed to the function call. If - there is a *weights* keyword argument then this should either evaluate - to False or be a dictionary of weights for at least one of the data - dimensions. - - :Parameters: - - func : function - - fpartial : function - - ffinalise : function - - indices: tuple - The indices of the master array which would create the - subspace. - - n_non_collapse_axes : int - The number of data array axes which are not being - collapsed. It is assumed that they are in the slowest moving - positions. - - n_collapse_axes : int - The number of data array axes which are being collapsed. It is - assumed that they are in the fastest moving positions. - - weights : dict, optional - - kwargs : *optional* - - :Returns: - - `list` - - **Examples:** - - """ - - ndim = self._ndim - - master_shape = self.shape - - data = self[indices] - - # If the input data array 'fits' in one chunk of memory, then - # make sure that it has only one partition - if ( - not _preserve_partitions - and data._pmndim - and data.fits_in_memory(data.dtype.itemsize) - ): - data.varray - - # True iff at least two, but not all, axes are to be - # collapsed. - reshape = 1 < n_collapse_axes < ndim - - out = None - - if n_collapse_axes == ndim: - # All axes are to be collapsed - kwargs.pop("axis", None) - else: - # At least one axis, but not all axes, are to be - # collapsed. It is assumed that the collapse axes are in - # the last (fastest varying) positions (-1, -2, ...). We - # set kwargs['axis']=-1 (actually we use the +ve integer - # equivalent of -1) if there is more then one collapse - # axis because, in this case (i.e. reshape is True), we - # will reshape everything. - kwargs["axis"] = ndim - n_collapse_axes - - masked = False - - sub_samples = 0 - - # pda_args = data.pda_args(revert_to_file=True) #, readonly=True) - config = data.partition_configuration(readonly=True) - - # Flag which partitions will be processed on this rank. If - # _parallelise_collapse_subspace is False then all partitions - # will be flagged for processing. 
- data._flag_partitions_for_processing(_parallelise_collapse_subspace) - - for i, partition in enumerate(data.partitions.matrix.flat): - if partition._process_partition: - # Only process a partition if flagged - partition.open(config) - array = partition.array - - p_masked = partition.masked - - if p_masked: - masked = True - if array.mask.all(): - # The array is all missing data - partition.close() - continue - - # Still here? Then there are some non-missing sub-array - # elements. - if weights is not None: - w = self._collapse_create_weights( - array, - partition.indices, - indices, - master_shape, - weights, - n_non_collapse_axes, - n_collapse_axes, - ) - wmin = w.min() - if wmin < 0: - raise ValueError( - "Can't collapse with negative weights" - ) - - if wmin == 0: - # Mask the array where the weights are zero - array = np.ma.masked_where(w == 0, array, copy=True) - if array.mask.all(): - # The array is all missing data - partition.close() - continue - # --- End: if - - kwargs["weights"] = w - # --- End: if - - partition.close() - - if reshape: - # At least two, but not all, axes are to be collapsed - # => we need to reshape the array and the weights. - shape = array.shape - ndim = array.ndim - new_shape = shape[:n_non_collapse_axes] - new_shape += (reduce(mul, shape[n_non_collapse_axes:]),) - array = np.reshape(array.copy(), new_shape) - - if weights is not None: - w = kwargs["weights"] - if w.ndim < ndim: - # The weights span only collapse axes (as - # opposed to spanning all axes) - new_shape = (w.size,) - - kwargs["weights"] = np.reshape(w, new_shape) - # --- End: if - - p_out = func(array, masked=p_masked, **kwargs) - - if out is None: - if ( - not _parallelise_collapse_subspace - and data.partitions.size == i + 1 - ): - # There is exactly one partition so we are done - out = p_out - break - # --- End: if - out = fpartial(p_out) - else: - out = fpartial(out, p_out) - # --- End: if - - sub_samples += 1 - - # --- End: if - # --- End: for - - # In the case that the inner loop is not parallelised, - # just finalise. - out = self._collapse_finalise( - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ) - # # --- End: if - - return out - - @classmethod - def _collapse_finalise( - cls, - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ): - """Finalise a collapse over a data array.""" - if out is not None: - # Finalise - N, out = ffinalise(out, sub_samples) - out = cls._collapse_mask(out, masked, N, Nmax, mtol) - else: - # no data - return all masked - out = np.ma.masked_all( - data.shape[:n_non_collapse_axes], data.dtype - ) - - return out - - @staticmethod - def _collapse_mask(array, masked, N, Nmax, mtol): - """Re-masks a masked array to reflect a collapse. - - :Parameters: - - array: numpy array - - masked: bool - - N: numpy array-like - - Nmax: int - - mtol: numpy array-like - - :Returns: - - numpy array - - """ - if masked and mtol < 1: - x = N < (1 - mtol) * Nmax - if x.any(): - array = np.ma.masked_where(x, array, copy=False) - # --- End: if - - return array - - @staticmethod - def _collapse_create_weights( - array, - indices, - master_indices, - master_shape, - master_weights, - n_non_collapse_axes, - n_collapse_axes, - ): - """Collapse weights of an array. - - :Parameters: - - array : numpy array - - indices : tuple - - master_indices : tuple - - master_shape : tuple - - master_weights : dict - - n_non_collapse_axes : int - The number of array axes which are not being collapsed. 
It - is assumed that they are in the slowest moving positions. - - n_collapse_axes : int - The number of array axes which are being collapsed. It is - assumed that they are in the fastest moving positions. - - :Returns: - - `numpy array or `None` - - **Examples:** - - """ - array_shape = array.shape - array_ndim = array.ndim - - weights_indices = [] - for master_index, index, size in zip( - master_indices, indices, master_shape - ): - start, stop, step = master_index.indices(size) - - size1, mod = divmod(stop - start - 1, step) - - start1, stop1, step1 = index.indices(size1 + 1) - - size2, mod = divmod(stop1 - start1, step1) - - if mod != 0: - size2 += 1 - - start += start1 * step - step *= step1 - stop = start + (size2 - 1) * step + 1 - - weights_indices.append(slice(start, stop, step)) - - base_shape = (1,) * array_ndim - - masked = False - zero_weights = False - - weights = [] - for key, weight in master_weights.items(): - shape = list(base_shape) - index = [] - for i in key: - shape[i] = array_shape[i] - index.append(weights_indices[i]) - - weight = weight[tuple(index)].array - - zero_weights = zero_weights or (weight.min() <= 0) - - masked = masked or np.ma.isMA(weight) - - if weight.ndim != array_ndim: - # Make sure that the weight has the same number of - # dimensions as the array - weight = weight.reshape(shape) - - weights.append(weight) - - weights_out = weights[0] - - if len(weights) > 1: - # There are two or more weights, so create their product - # (can't do this in-place because of broadcasting woe) - for w in weights[1:]: - weights_out = weights_out * w - - weights_out_shape = weights_out.shape - - if ( - not masked - and weights_out_shape[:n_non_collapse_axes] - == base_shape[:n_non_collapse_axes] - ): - # The input weights are not masked and only span collapse axes - weights_out = weights_out.reshape( - weights_out_shape[n_non_collapse_axes:] - ) - - if ( - weights_out_shape[n_non_collapse_axes:] - != array_shape[n_non_collapse_axes:] - ): - # The input weights span some, but not all, of the - # collapse axes, so broadcast the weights over all - # collapse axes - weights_out = broadcast_array( - weights_out, array_shape[n_non_collapse_axes:] - ) - else: - if weights_out_shape != array_shape: - # Either a) The input weights span at least one - # non-collapse axis, so broadcast the weights over all - # axes or b) The weights contain masked values - weights_out = broadcast_array(weights_out, array_shape) - - if masked and np.ma.isMA(array): - if not (array.mask | weights_out.mask == array.mask).all(): - raise ValueError( - "The output weights mask {} is not compatible with " - "the array mask {}.".format( - weights_out.mask, array.mask - ) - ) - # --- End: if - - return weights_out - - def _collapse_optimize_weights(self, weights): - """Optimise when weights span only non-partitioned axes. 
- - weights: `dict` - - """ - non_partitioned_axes = set(self._axes).difference(self._pmaxes) - - x = [] - new_key = () - for key in weights: - if non_partitioned_axes.issuperset(key): - x.append(key) - new_key += key - # --- End: for - - if len(x) > 1: - reshaped_weights = [] - for key in x: - w = weights.pop(key) - w = w.array - shape = [ - (w.shape[key.index(axis)] if axis in key else 1) - for axis in new_key - ] - w = w.reshape(shape) - - reshaped_weights.append(w) - - # Create their product - new_weight = reshaped_weights[0] - for w in reshaped_weights[1:]: - new_weight = new_weight * w - - weights[new_key] = type(self)(new_weight) - - return weights - - # ---------------------------------------------------------------- - # Private attributes - # ---------------------------------------------------------------- - @property - def _Units(self): - """Storage for the units. - - The units are stored in a `Units` object, and reflect the - units of the (yet to be computed) elements of the underlying - data. - - .. warning:: Assigning to `_Units` does *not* trigger a units - conversion of the underlying data - values. Therefore assigning to `_Units` should - only be done in cases when it is known that the - intrinsic units represented by the data values - are inconsistent with the existing value of - `_Units`. Before assigning to `_Units`, first - consider if assigning to `Units`, or calling the - `override_units` or `override_calendar` method is - a more appropriate course of action, and use one - of those if possible. - - """ - return self._custom["_Units"] - - @_Units.setter - def _Units(self, value): - self._custom["_Units"] = value - - @_Units.deleter - def _Units(self): - self._custom["_Units"] = _units_None - - @property - def _cyclic(self): - """Storage for axis cyclicity. - - Contains a `set` that identifies which axes are cyclic (and - therefore allow cyclic slicing). The set contains a subset of - the axis identifiers defined by the `_axes` attribute. - - .. warning:: Never change the value of the `_cyclic` attribute - in-place. - - .. note:: When an axis identifier is removed from the `_axes` - attribute then it is automatically also removed from - the `_cyclic` attribute. - - """ - return self._custom["_cyclic"] - - @_cyclic.setter - def _cyclic(self, value): - self._custom["_cyclic"] = value - - @_cyclic.deleter - def _cyclic(self): - self._custom["_cyclic"] = _empty_set - - @property - @daskified(_DASKIFIED_VERBOSE) - def _hardmask(self): - """Storage for the mask hardness. - - Contains a `bool`, where `True` denotes a hard mask and - `False` denotes a soft mask. - - .. warning:: Assigning to `_hardmask` does *not* trigger a - hardening or softening of the mask of the - underlying data values. Therefore assigning to - `_hardmask` should only be done in cases when it - is known that the intrinsic mask hardness of the - data values is inconsistent with the - existing value of `_hardmask`. Before assigning - to `_hardmask`, first consider if assigning to - `hardmask`, or calling the `harden_mask` or - `soften_mask` method is a more appropriate course - of action, and use one of those if possible. - - See `hardmask` for details. - - """ - return self._custom["_hardmask"] - - @_hardmask.setter - def _hardmask(self, value): - self._custom["_hardmask"] = value - - @property - @daskified(_DASKIFIED_VERBOSE) - def _axes(self): - """Storage for the axis identifiers. - - Contains a `tuple` of identifiers, one for each array axis. - - .. 
note:: When the axis identifiers are reset, then any axis - identifier named by the `_cyclic` attribute which is - not in the new `_axes` set is automatically removed - from the `_cyclic` attribute. - - """ - return self._custom["_axes"] - - @_axes.setter - def _axes(self, value): - self._custom["_axes"] = tuple(value) - - # Remove cyclic axes that are not in the new axes - cyclic = self._cyclic - if cyclic: - # Never change the value of the _cyclic attribute in-place - self._cyclic = cyclic.intersection(value) - - # ---------------------------------------------------------------- - # Dask attributes - # ---------------------------------------------------------------- - @property - @daskified(_DASKIFIED_VERBOSE) - def chunks(self): - """The chunk sizes for each dimension. - - **Examples** - - >>> d = cf.Data.ones((4, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2), (4, 1)) - - """ - return self._get_dask().chunks - - @chunks.setter - def chunks(self, chunks): - raise TypeError( - "Can't set chunks directly. Use the 'rechunk' method instead." - ) - - @property - def force_compute(self): - """TODODASK See also confg settings.""" - return self._custom.get("force_compute", False) - - @force_compute.setter - def force_compute(self, value): - self._custom["force_compute"] = bool(value) - - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- - @property - @daskified(_DASKIFIED_VERBOSE) - def Units(self): - """The `cf.Units` object containing the units of the data array. - - Can be set to any units equivalent to the existing units. - - .. seealso `override_units`, `override_calendar` - - **Examples:** + **Examples:** >>> d = cf.Data([1, 2, 3], units='m') >>> d.Units @@ -6034,12 +5241,9 @@ def cf_Units(x): x=x, from_units=old_units, to_units=value, inplace=False ) - self._map_blocks( - cf_Units, - delete_source=False, - reset_mask_hardness=False, - dtype=dtype, - ) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_Units, dtype=dtype) + self._set_dask(dx, reset_mask_hardness=False) self._Units = value @@ -7635,7 +6839,6 @@ def argmax(self, axis=None, unravel=False): is located. unravel: `bool`, optional - If True then when locating the maximum over the whole data, return the location as an index for each axis as a `tuple`. By default an index to the flattened array @@ -7830,122 +7033,178 @@ def set_units(self, value): """ self.Units = Units(value, self.get_calendar(default=None)) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") - def maximum( + def max( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their maximum. + """Calculate maximum values. - Missing data array elements are omitted from the calculation. + Calculates the maximum value or the maximum values along axes. - .. seealso:: `minimum`, `mean`, `mid_range`, `sum`, `sd`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `maximum_absolute_value`, `min` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} - squeeze : bool, optional + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. 
versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.max() + <CF Data(1, 1): [[11]] K> + """ - # TODODASK: Placeholder for the real thing, that takes into - # account axes=axes, squeeze=squeeze, mtol=mtol, - # inplace=inplace. - # - # This is only here for now, in this form, to ensure that - # cf.read works - return self.get_dask(copy=False).max() + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.max, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, + mtol=mtol, + ) - # return self._collapse(max_f, max_fpartial, max_ffinalise, axes=axes, - # squeeze=squeeze, mtol=mtol, inplace=inplace, - # _preserve_partitions=_preserve_partitions) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def maximum_absolute_value( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their maximum absolute value. + """Calculate maximum absolute values. - Missing data elements are omitted from the calculation. + Calculates the maximum absolute value or the maximum absolute + values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. seealso:: `sample_size`, `max`, `minimum_absolute_value` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} - squeeze : bool, optional + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.maximum_absolute_value() + <CF Data(1, 1): [[99]] K> - - >>> d.max() - - >>> d.maximum_absolute_value(axes=1) - - >>> d.max(axes=1) - + """ - return self._collapse( - max_abs_f, - max_abs_fpartial, - max_abs_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.max_abs, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") - def minimum( + def min( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, _preserve_partitions=False, ): - """Collapse axes with their minimum. + """Calculate minimum values. + + Calculates the minimum value or the minimum values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - Missing data array elements are omitted from the calculation.
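# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): a usage sketch of the
# new dask-backed collapses defined above, assuming this branch of
# cf-python is installed. The data values follow the docstring examples.
import numpy as np

import cf

d = cf.Data(np.ma.arange(12).reshape(4, 3), "K")
d[1, 1] = cf.masked
print(d.max())                            # expected <CF Data(1, 1): [[11]] K>
print(d.max(axes=1, squeeze=True).shape)  # expected (4,): collapsed axis removed
# ---------------------------------------------------------------------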
+ .. seealso:: `sample_size`, `max`, `minimum_absolute_value` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -7954,133 +7213,139 @@ def minimum( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - .. seealso:: `maximum`, `mean`, `mid_range`, `sum`, `sd`, `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.min() + <CF Data(1, 1): [[0]] K> + """ - return self._collapse( - min_f, - min_fpartial, - min_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.min, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def minimum_absolute_value( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their minimum absolute value. + """Calculate minimum absolute values. + + Calculates the minimum absolute value or the minimum absolute + values along axes. - Missing data elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + .. seealso:: `sample_size`, `maximum_absolute_value`, `min` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.minimum_absolute_value() + <CF Data(1, 1): [[1]] K> - - >>> d.d.min() - - >>> d.minimum_absolute_value(axes=1) - - >>> d.min(axes=1) - - - """ - return self._collapse( - min_abs_f, - min_abs_fpartial, - min_abs_ffinalise, - axes=axes, - squeeze=squeeze, + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.min_abs, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def mean( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their mean. + """Calculate mean values. + + Calculates the mean value or the mean values along axes. - The mean is unweighted by default, but may be weighted (see the - *weights* parameter).
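# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): the dictionary form of
# *weights* accepted by `mean` and the other weighted collapses. Each
# value weights its key axes, and the components are combined as an
# outer product (see `_parse_weights` at the end of this module).
import numpy as np

y = np.array([2.0, 1.0, 1.0, 2.0])  # weights for axis 0 (size 4)
x = np.array([1.0, 1.5, 2.0])       # weights for axis 1 (size 3)
w = np.multiply.outer(y, x)         # shape (4, 3), broadcastable to the data
# d.mean(weights={0: y, 1: x}) is then expected to match d.mean(weights=w).
# ---------------------------------------------------------------------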
+ See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - Missing data array elements and their corresponding weights - are omitted from the calculation. + .. seealso:: `sample_size`, `mean_absolute_value`, `sd`, `sum` :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -8089,143 +7354,76 @@ def mean( :Returns: `Data` or `None` - The collapsed array. - - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked >>> print(d.array) - [[1 2 4] - [1 4 9]] - + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.mean() + <CF Data(1, 1): [[5.636363636363637]] K> - - >>> d.mean(squeeze=True) - - >>> d.mean(axes=[0, 1]) - - >>> d.mean(axes=[1, 0]) - - >>> print(d.mean(axes=0).array) - [[1. 3. 6.5]] - >>> print(d.mean(axes=1).array) - [[2.33333333] - [4.66666667]] - >>> d.mean(axes=1, squeeze=True) - - - >>> y = cf.Data([1, 3]) - >>> x = cf.Data([1, 2, 1]) - >>> w = cf.Data.insert_dimension(y, 1) * x - >>> print(w.array) - [[1 2 1] - [3 6 3]] + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. 
] >>> d.mean(weights=w) - - >>> d.mean(weights={(0, 1): w}) - - >>> d.mean(axes=[0, 1], weights={(0, 1): w}) - - >>> d.mean(axes=[1, 0], weights={(0, 1): w}) - - >>> d.mean(axes=(0, 1), weights={1: x, 0: y}) - - - >>> d.mean(axes=1, weights=w) - - >>> d.mean(axes=1, weights=x) - - >>> d.mean(axes=1, weights={1: x}) - - >>> d.mean(axes=1, weights={(0, 1): w}) - - >>> d.mean(axes=1, weights={0: y, (1,): x}) - - - >>> d.mean(axes=1) - - >>> d.mean(axes=1, weights={0: y}) - - - >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4)) - >>> print(e.array) - [[[ 0 1 2 3] - [ 4 5 6 7]] - [[ 8 9 10 11] - [12 13 14 15]] - [[16 17 18 19] - [20 21 22 23]]] - - >>> e.mean(axes=[0, 2]) - - >>> f = e.mean(axes=[0, 2], squeeze=True) - >>> f - - >>> f.shape - (2,) - >>> print(e.mean(axes=[0, 1]).array) - [[[10. 11. 12. 13.]]] - >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) - [[[11. 12. 13. 14.]]] - - >>> e[0, 0] = cf.masked - >>> e[-1, -1] = cf.masked - >>> e[..., 2] = cf.masked - >>> print(e.array) - [[[-- -- -- --] - [4 5 -- 7]] - [[8 9 -- 11] - [12 13 -- 15]] - [[16 17 -- 19] - [-- -- -- --]]] - - >>> e.mean() - - >>> print(e.mean(axes=[0, 1]).array) - [[[10.0 11.0 -- 13.0]]] - >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) - [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]] - - """ - return self._collapse( - mean_f, - mean_fpartial, - mean_ffinalise, - axes=axes, - squeeze=squeeze, + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.mean, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def mean_absolute_value( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their mean absolute value. + """Calculate mean absolute values. + + Calculates the mean absolute value or the mean absolute values + along axes. - Missing data elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - weights: + {{weights: data_like, `dict`, or `None`, optional}} - squeeze : bool, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -8235,158 +7433,203 @@ def mean_absolute_value( The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.mean_absolute_value() - - >>> d.mean_absolute_value(axes=1) - + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. 
] + >>> d.mean_absolute_value(weights=w) + <CF Data(1, 1): [[11.878787878787879]] K> + """ - return self._collapse( - mean_abs_f, - mean_abs_fpartial, - mean_abs_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.mean_abs, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def integral( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, _preserve_partitions=False, ): - """Collapse axes with their integral. + """Calculate integral values. + + Calculates the integral value or the integral values along + axes. Unlike a weighted `sum`, the units of the weights are + incorporated into the result. - If weights are not provided then all non-missing elements are - given weighting of one such that the collapse method becomes - a `sum`. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. - - Note that the units of the weights matter for an integral - collapse, which differs from a weighted sum in that the units - of the weights are incorporated into the result. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` - The collapsed data, or `None` of the operation was + The collapsed data, or `None` if the operation was in-place. - ..
seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.integral() + - """ - if weights is None: - units = None - else: - units = self.Units - if not units: - units = Units("1") + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.integral(weights=w) + - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = units * weights_units - else: - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * weights_units - # --- End: if + >>> d.integral(weights=cf.Data(w, 'm')) + - return self._collapse( - sum_f, - sum_fpartial, - sum_ffinalise, - axes=axes, - squeeze=squeeze, + """ + d = _inplace_enabled_define_and_cleanup(self) + d, weights = _collapse( + Collapse.sum, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - units=units, - _preserve_partitions=_preserve_partitions, ) + new_units = None + if weights is not None: + weights_units = getattr(weights, "Units", None) + if weights_units: + units = self.Units + if units: + new_units = units * weights_units + else: + new_units = weights_units + + if new_units is not None: + d.override_units(new_units, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def sample_size( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapses axes with their sample size. + """Calculate sample size values. + + The sample size is the number of non-missing values. + + Calculates the sample size value or the sample size values + along axes. + + .. seealso:: `sum_of_weights` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. 
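# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): the effect of *mtol*
# on the collapses above. With the default mtol=1 a collapsed value is
# masked only when all of its contributing values are missing; with
# mtol=0 a single missing contribution masks the result.
import numpy as np

import cf

d = cf.Data(np.ma.arange(12).reshape(4, 3), "K")
d[1, 1] = cf.masked
print(d.mean(axes=0).array)          # every column mean is defined
print(d.mean(axes=0, mtol=0).array)  # the middle column is masked
# ---------------------------------------------------------------------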
+ + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sample_size() + <CF Data(1, 1): [[11]]> + """ - return self._collapse( - sample_size_f, - sample_size_fpartial, - sample_size_ffinalise, - axes=axes, - squeeze=squeeze, - weights=None, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.sample_size, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=Units("1"), - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d.override_units(_units_None, inplace=True) + + return d @property @daskified(_DASKIFIED_VERBOSE) @@ -8765,7 +8008,8 @@ def count(self): 8 """ - # TODODASK - daskify, previously parallelise=mpi_on (not =False) + # TODODASK - simply use da.ma.count (dask>=2022.3.1) + config = self.partition_configuration(readonly=True) n = 0 @@ -9848,12 +9092,9 @@ def harden_mask(self): [1 -- 3] """ - self._map_blocks( - cf_harden_mask, - delete_source=False, - reset_mask_hardness=False, - dtype=self.dtype, - ) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) + self._set_dask(dx, delete_source=False, reset_mask_hardness=False) self._hardmask = True def has_calendar(self): @@ -9948,12 +9189,9 @@ def soften_mask(self): [ 1 999 3] """ - self._map_blocks( - cf_soften_mask, - delete_source=False, - reset_mask_hardness=False, - dtype=self.dtype, - ) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) + self._set_dask(dx, delete_source=False, reset_mask_hardness=False) self._hardmask = False @daskified(_DASKIFIED_VERBOSE) @@ -10006,7 +9244,9 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - d._map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + dx = d.to_dask_array() + dx = dx.map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -10548,19 +9788,36 @@ def override_calendar(self, calendar, inplace=False, i=False): {{inplace: `bool`, optional}} - {{i: deprecated at version 3.0.0}} + {{i: deprecated at version 3.0.0}} + + :Returns: + + `Data` or `None` + + **Examples:** + + """ + d = _inplace_enabled_define_and_cleanup(self) + d._Units = Units(d.Units._units, calendar) + + return d + + def to_dask_array(self): + """Convert the data to a `dask` array. + + The returned dask array is the one contained within the `Data` + instance. :Returns: - `Data` or `None` + `dask.array.Array` + The dask array of the `Data` instance. **Examples:** - """ - d = _inplace_enabled_define_and_cleanup(self) - d._Units = Units(d.Units._units, calendar) + >>> dx = d.to_dask_array() - return d + """ + return self._get_dask() def to_disk(self): """Store the data array on disk. @@ -11014,28 +10271,42 @@ def masked_all(cls, shape, dtype=None, units=None, chunk=True): return cls(array, units=units, chunk=chunk) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def mid_range( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, i=False, ): - """Collapse axes with the unweighted average of their maximum - and minimum values. + """Calculate mid-range values. + + The mid-range is the average of the maximum and the minimum. - Missing data array elements are omitted from the calculation. + Calculates the mid-range value or the mid-range values along + axes. - ..
seealso:: `maximum`, `minimum`, `mean`, `range`, `sum`, `sd`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `max`, `min`, `range` :Parameters: - axes: (sequence of) `int`, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -11046,19 +10317,30 @@ def mid_range( `Data` or `None` The collapsed array. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.mid_range() + """ - return self._collapse( - mid_range_f, - mid_range_fpartial, - mid_range_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.mid_range, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @@ -11205,6 +10487,71 @@ def isclose(self, y, rtol=None, atol=None): except (TypeError, NotImplementedError, IndexError): return self == y + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): + """Change the shape of the data without changing its values. + + :Parameters: + + shape: `tuple` of `int`, or any number of `int` + The new shape for the data, which should be compatible + with the original shape. If an integer, then the + result will be a 1-d array of that length. One shape + dimension can be -1, in which case the value is + inferred from the length of the array and remaining + dimensions. + + merge_chunks: `bool` + When True (the default) merge chunks using the logic + in `dask.array.rechunk` when communication is + necessary given the input array chunking and the + output shape. When False, the input array will be + rechunked to a chunksize of 1, which can create very + many tasks. See `dask.array.reshape` for details. + + limit: int, optional + The maximum block size to target in bytes. If no limit + is provided, it defaults to the configuration value + ``dask.config.get('array.chunk-size')``. See + `dask.array.reshape` for details. + + :Returns: + + `Data` or `None` + The reshaped data, or `None` if the operation was + in-place. 
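# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): the merge_chunks
# trade-off described above, shown with dask directly. merge_chunks=False
# skips the chunk-merging logic but rechunks the input to chunksize 1,
# which can create very many tasks on large arrays.
import dask.array as da

x = da.ones((4, 6), chunks=(2, 3))
print(x.reshape(8, 3, merge_chunks=True).chunks)   # merged, coarser chunks
print(x.reshape(8, 3, merge_chunks=False).chunks)  # fine-grained chunks
# ---------------------------------------------------------------------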
+ + **Examples** + + >>> d = cf.Data(np.arange(12)) + >>> print(d.array) + [ 0 1 2 3 4 5 6 7 8 9 10 11] + >>> print(d.reshape(3, 4).array) + [[ 0 1 2 3] + [ 4 5 6 7] + [ 8 9 10 11]] + >>> print(d.reshape((4, 3)).array) + [[ 0 1 2] + [ 3 4 5] + [ 6 7 8] + [ 9 10 11]] + >>> print(d.reshape(-1, 6).array) + [[ 0 1 2 3 4 5] + [ 6 7 8 9 10 11]] + >>> print(d.reshape(1, 1, 2, 6).array) + [[[[ 0 1 2 3 4 5] + [ 6 7 8 9 10 11]]]] + >>> print(d.reshape(1, 1, -1).array) + [[[ 0 1 2 3 4 5 6 7 8 9 10 11]]] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d._get_dask() + dx = dx.reshape(*shape, merge_chunks=merge_chunks, limit=limit) + d._set_dask(dx, reset_mask_hardness=True) + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) @@ -11241,84 +10588,79 @@ def rint(self, inplace=False, i=False): d._set_dask(da.rint(dx), reset_mask_hardness=False) return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def root_mean_square( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their root mean square. + """Calculate root mean square (RMS) values. + + Calculates the RMS value or the RMS values along axes. - Missing data array elements and their corresponding weights are - omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. seealso:: `sample_size`, `mean`, `sum` :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{collapse squeeze: `bool`, optional}} - {{inplace: `bool`, optional}} + {{mtol: number, optional}} - {{i: deprecated at version 3.0.0}} + {{split_every: `int` or `dict`, optional}} + + ..
versionadded:: TODODASK + + {{inplace: `bool`, optional}} :Returns: `Data` or `None` The collapsed array. - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.root_mean_square() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.root_mean_square(weights=w) + """ - return self._collapse( - root_mean_square_f, - root_mean_square_fpartial, - root_mean_square_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.rms, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @@ -11390,7 +10732,7 @@ def stats( sum=False, sum_of_squares=False, variance=False, - weights=False, + weights=None, ): """Calculate statistics of the data. @@ -11467,17 +10809,7 @@ def stats( Calculate the square root of the weighted or unweighted mean of the squares of the values. - weights: data-like or dict, optional - The weights to apply to the calculations. By default the - statistics are unweighted. - - The weights may be contained in any scalar or array-like - object (such as a numpy array or `Data` instance) that is - broadcastable to the shape of the data. If *weights* is a - dictionary then each key is axes of the array (an `int` or - `tuple` of `int`) with a corresponding data-like value of - weights for those axes. In this case, the implied weights - array is the outer product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} :Returns: @@ -11529,15 +10861,14 @@ def stats( 'sample_size': 5} """ - no_weights = ( "minimum", + "median", "maximum", "range", "mid_range", "minimum_absolute_value", "maximum_absolute_value", - "median", "sum", "sum_of_squares", ) @@ -11561,14 +10892,13 @@ def stats( "variance", ): if all or locals()[stat]: - f = getattr(self, stat) + func = getattr(self, stat) if stat in no_weights: - value = f(squeeze=True) + value = func(squeeze=True) else: - value = f(squeeze=True, weights=weights) + value = func(squeeze=True, weights=weights) out[stat] = value - # --- End: for if all or sample_size: out["sample_size"] = int(self.sample_size()) @@ -12847,27 +12177,42 @@ def func( return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def range( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, i=False, ): - """Collapse axes with the absolute difference between their - maximum and minimum values. + """Calculate range values. + + The range is the maximum minus the minimum. - Missing data array elements are omitted from the calculation. + Calculates the range value or the range values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sample_size`, - `sd`, `sum`, `sum_of_weights`, `sum_of_weights2`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `max`, `min`, `mid_range` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. 
versionadded:: TODODASK + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -12877,20 +12222,30 @@ def range( `Data` or `None` The collapsed array. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.range() + """ - return self._collapse( - range_f, - range_fpartial, - range_ffinalise, - axes=axes, - squeeze=squeeze, - weights=None, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.range, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -12943,30 +12298,43 @@ def roll(self, axis, shift, inplace=False, i=False): return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def sum( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their sum. + """Calculate sum values. - Missing data array elements are omitted from the calculation. + Calculates the sum value or the sum values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, - `sample_size`, `sd`, `sum_of_weights`, - `sum_of_weights2`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `integral`, `mean`, `sd`, + `sum_of_squares`, `sum_of_weights` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} - squeeze : bool, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -12975,45 +12343,77 @@ def sum( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum(weights=cf.Data(w, 'm')) + """ - return self._collapse( - sum_f, - sum_fpartial, - sum_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.sum, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def sum_of_squares( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with the sum of the squares of the values. + """Calculate sums of squares. + + Calculates the sum of squares or the sum of squares values + along axes. - Missing data array elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. 
seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, - `sample_size`, `sd`, `sum_of_weights`, - `sum_of_weights2`, `var` + .. seealso:: `sample_size`, `sum`, `sum_of_weights`, + `sum_of_weights2` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} - squeeze : bool, optional + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -13023,55 +12423,82 @@ def sum_of_squares( The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.sum_of_squares() + <CF Data(1, 1): [[490]] K2> - - >>> d.sum_of_squares(axes=1) - + - """ - units = self.Units - if units: - units = units ** 2 + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_squares(weights=w) + <CF Data(1, 1): [[779.0]] K2> - return self._collapse( - sum_of_squares_f, - sum_of_squares_fpartial, - sum_of_squares_ffinalise, + """ + d = _inplace_enabled_define_and_cleanup(self) + d.square(inplace=True) + d.sum( axes=axes, - squeeze=squeeze, weights=weights, - units=units, + squeeze=squeeze, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, + inplace=True, ) + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) def sum_of_weights( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with the sum of weights. + """Calculate sums of weights. + + Calculates the sum of weights or the sum of weights values + along axes. - Missing data array elements are omitted from the calculation. + The weights given by the *weights* parameter are internally + broadcast to the shape of the data, and those weights that are + missing data, or that correspond to the missing elements of + the data, are assigned a weight of 0. It is these processed + weights that are summed. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. seealso:: `sample_size`, `sum`, `sum_of_squares`, + `sum_of_weights2` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} - squeeze : bool, optional + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place.
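# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): the "processed"
# weights described above, in plain numpy. The weights are broadcast to
# the data shape and zeroed where the data are missing before summing.
import numpy as np

a = np.ma.arange(12).reshape(4, 3)
a[1, 1] = np.ma.masked
w = np.broadcast_to(np.linspace(1, 2, 3), a.shape)
w = np.where(np.ma.getmaskarray(a), 0, w)
print(w.sum())  # 16.5, matching d.sum_of_weights(weights=np.linspace(1, 2, 3))
# ---------------------------------------------------------------------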
- **Examples:** + **Examples** - """ - if weights is None: - units = Units() - else: - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = weights_units - else: - units = Units("1") - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * weights_units - # --- End: if + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum_of_weights() + <CF Data(1, 1): [[11]]> - return self._collapse( - sw_f, - sw_fpartial, - sw_ffinalise, - axes=axes, - squeeze=squeeze, + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_weights(weights=w) + <CF Data(1, 1): [[16.5]]> + + >>> d.sum_of_weights(weights=cf.Data(w, 'm')) + <CF Data(1, 1): [[16.5]] m> + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, weights = _collapse( + Collapse.sum_of_weights, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + units = _units_None + if weights is not None: + units = getattr(weights, "Units", None) + if units is None: + units = _units_None + + d.override_units(units, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) def sum_of_weights2( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with the sum of squares of weights. + """Calculate sums of squares of weights. + + Calculates the sum of squares of weights or the sum of squares + of weights values along axes. - Missing data array elements are omitted from the calculation. + The weights given by the *weights* parameter are internally + broadcast to the shape of the data, and those weights that + are missing data, or that correspond to the missing elements + of the data, are assigned a weight of 0. It is these processed + weights that are squared and summed. - .. seealso:: `maximum`, `mean`, `mid_range`, `minimum`, `range`, - `sample_size`, `sd`, `sum`, `sum_of_weights`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. seealso:: `sample_size`, `sum`, `sum_of_squares`, + `sum_of_weights` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{weights: data_like, `dict`, or `None`, optional}} - {{inplace: `bool`, optional}} + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place.
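# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch), continuing the numpy
# sketch above for `sum_of_weights2`: the processed weights are squared
# before summing, and any weights units are squared in the result.
import numpy as np

a = np.ma.arange(12).reshape(4, 3)
a[1, 1] = np.ma.masked
w = np.where(
    np.ma.getmaskarray(a), 0, np.broadcast_to(np.linspace(1, 2, 3), a.shape)
)
print((w**2).sum())  # 26.75; metre weights would give units of m2
# ---------------------------------------------------------------------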
- **Examples:** + **Examples** - """ - if weights is None: - units = Units() - else: - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = weights_units - else: - units = Units("1") - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * (weights_units ** 2) - # --- End: if + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum_of_weights2() + - return self._collapse( - sw2_f, - sw2_fpartial, - sw2_ffinalise, - axes=axes, - squeeze=squeeze, + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_weights2(weights=w) + + + >>> d.sum_of_weights2(weights=cf.Data(w, 'm')) + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, weights = _collapse( + Collapse.sum_of_weights2, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + units = _units_None + if weights is not None: + units = getattr(weights, "Units", None) + if not units: + units = _units_None + else: + units = units ** 2 + + d.override_units(units, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") - def sd( + @_inplace_enabled(default=False) + def std( self, axes=None, squeeze=False, mtol=1, weights=None, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - r"""Collapse axes by calculating their standard deviation. - - The standard deviation may be adjusted for the number of degrees of - freedom and may be calculated with weighted values. - - Missing data array elements and those with zero weight are omitted - from the calculation. - - The unweighted standard deviation, :math:`s`, of :math:`N` values - :math:`x_i` with mean :math:`m` and with :math:`N-ddof` degrees of - freedom (:math:`ddof\ge0`) is: - - .. math:: s=\sqrt{\\frac{1}{N-ddof} \sum_{i=1}^{N} (x_i - m)^2} - - The weighted standard deviation, :math:`\\tilde{s}_N`, of :math:`N` - values :math:`x_i` with corresponding weights :math:`w_i`, weighted - mean :math:`\\tilde{m}` and with :math:`N` degrees of freedom is: - - .. math:: \\tilde{s}_N=\sqrt{\\frac{1}{\sum_{i=1}^{N} w_i} - \sum_{i=1}^{N} w_i(x_i - \\tilde{m})^2} - - The weighted standard deviation, :math:`\\tilde{s}`, of :math:`N` - values :math:`x_i` with corresponding weights :math:`w_i` and with - :math:`N-ddof` degrees of freedom (:math:`ddof>0`) is: - - .. math:: \\tilde{s} = \sqrt{\\frac{f \sum_{i=1}^{N} w_i}{f - \sum_{i=1}^{N} w_i - ddof}} \\tilde{s}_N - - where :math:`f` is the smallest positive number whose product with - each weight is an integer. :math:`f \sum_{i=1}^{N} w_i` is the - size of a new sample created by each :math:`x_i` having - :math:`fw_i` repeats. In practice, :math:`f` may not exist or may - be difficult to calculate, so :math:`f` is either set to a - predetermined value or an approximate value is calculated. The - approximation is the smallest positive number whose products with - the smallest and largest weights and the sum of the weights are - all integers, where a positive number is considered to be an - integer if its decimal part is sufficiently small (no greater than - :math:`10^{-8}` plus :math:`10^{-5}` times its integer part). 
This - approximation will never overestimate :math:`f`, so - :math:`\\tilde{s}` will never be underestimated when the - approximation is used. If the weights are all integers which are - collectively coprime then setting :math:`f=1` will guarantee that - :math:`\\tilde{s}` is exact. + r"""Calculate standard deviations. + + Calculates the standard deviation of an array or the standard + deviations along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `mean`, `sum`, `var` :Parameters: - axes : (sequence of) `int`, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze : `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1. When the collapsed axes are retained, the - result is guaranteed to broadcast correctly against the - original array. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - Suppose that an array, ``d``, has shape (2, 3, 4) and - ``e = d.sd(axis=1)``. Then ``e`` has shape (2, 1, 4) - and, for example, ``d/e`` is allowed. If ``e = - d.sd(axis=1, squeeze=True)`` then ``e`` will have shape - (2, 4) and ``d/e`` is an illegal operation. - - weights : data-like or `dict`, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - equal weights of 1. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values it may be used in - conjunction with any value of *axes*, because the axes to - which the weights apply are given explicitly. + {{collapse squeeze: `bool`, optional}} - *Parameter example:* - Suppose that the original array being collapsed has - shape (2, 3, 4) and *weights* is set to a data-like - object, ``w``. If ``axes=None`` then ``w`` must have - shape (2, 3, 4). If ``axes=(0, 1, 2)`` then ``w`` must - have shape (2, 3, 4). If ``axes=(2, 0, 1)`` then ``w`` - must either have shape (2, 3, 4) or else (4, 2, 3). If - ``axes=1`` then ``w`` must either have shape (2, 3, 4) - or else (3,). If ``axes=(2, 0)`` then ``w`` must either - have shape (2, 3, 4) or else (4, 2). Suppose *weights* - is a dictionary. If ``weights={1: x}`` then ``x`` must - have shape (3,). If ``weights={1: x, (2, 0): y}`` then - ``x`` must have shape (3,) and ``y`` must have shape (4, - 2). The last example is equivalent to ``weights={(1, 2, - 0): x.outerproduct(y)}`` (see `outerproduct` for - details). - - mtol : number, optional - For each element in the output data array, the fraction of - contributing input array elements which is allowed to - contain missing data. Where this fraction exceeds *mtol*, - missing data is returned. The default is 1, meaning a - missing datum in the output array only occurs when its - contributing input array elements are all missing data. 
A - value of 0 means that a missing datum in the output array - occurs whenever any of its contributing input array - elements are missing data. Any intermediate value is - permitted. - - ddof : number, optional - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of elements. By default *ddof* is 0 + {{mtol: number, optional}} + + {{ddof: number}} + + By default *ddof* is 0. + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -13307,66 +12704,87 @@ def sd( :Returns: - `Data` or `None` + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.std() + + >>> d.std(ddof=1) + - **Examples:** + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.std(ddof=1, weights=w) + - >>> d = cf.Data([1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]) - >>> e = cf.Data([1, 2, 3, 4]) - >>> d.sd(squeeze=False) - - >>> d.sd() - - >>> e.sd(weights=[2, 3, 5, 6]) - - >>> e.sd(weights=[2, 3, 5, 6], f=1) - - >>> d.sd(ddof=0) - - >>> e.sd(ddof=0, weights=[2, 3, 5, 6]) - - - """ - return self._collapse( - sd_f, - sd_fpartial, - sd_ffinalise, + """ + d = _inplace_enabled_define_and_cleanup(self) + d.var( axes=axes, - squeeze=squeeze, weights=weights, + squeeze=squeeze, mtol=mtol, ddof=ddof, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, + inplace=True, ) + d.sqrt(inplace=True) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def var( self, axes=None, - squeeze=False, weights=None, + squeeze=False, mtol=1, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their weighted variance. + """Calculate variances. - The units of the returned array are the square of the units of the - array. + Calculates the variance of an array or the variance values + along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, `sum`, - `sd`, `stats` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{ddof: number}} + + By default *ddof* is 0. - squeeze : bool, optional + {{split_every: `int` or `dict`, optional}} - weights : + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -13375,29 +12793,49 @@ def var( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - """ - units = self.Units - if units: - units = units ** 2 + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.var() + + >>> d.var(ddof=1) + - return self._collapse( - var_f, - var_fpartial, - var_ffinalise, - axes=axes, - squeeze=squeeze, + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. 
] + >>> d.var(ddof=1, weights=w) + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.var, + d, + axis=axes, weights=weights, + keepdims=not squeeze, mtol=mtol, - units=units, ddof=ddof, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, ) + units = d.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + @daskified(_DASKIFIED_VERBOSE) def section( self, axes, stop=None, chunks=False, min_step=1, mode="dictionary" @@ -13488,52 +12926,210 @@ def section( return _section(self, axes, min_step=min_step) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def square(self, dtype=None, inplace=False): + """Calculate the element-wise square. + + .. versionadded:: TODODASK + + .. seealso:: `sqrt`, `sum_of_squares` + + :Parameters: + + dtype: data-type, optional + Overrides the data type of the output arrays. A + matching precision of the calculation should be + chosen. For example, a *dtype* of ``'int32'`` is only + allowed when the input values are integers. + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The element-wise square of the data, or `None` if the + operation was in-place. + + **Examples** + + >>> d = cf.Data([[0, 1, 2.5, 3, 4]], 'K', mask=[[0, 0, 0, 1, 0]]) + >>> print(d.array) + [[0.0 1.0 2.5 -- 4.0]] + >>> e = d.square() + >>> e + <CF Data(1, 5): [[0.0, ..., 16.0]] K2> + >>> print(e.array) + [[0.0 1.0 6.25 -- 16.0]] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d.to_dask_array() + dx = da.square(dx, dtype=dtype) + d._set_dask(dx, reset_mask_hardness=False) + + units = d.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def sqrt(self, dtype=None, inplace=False): + """Calculate the non-negative square root. + + .. versionadded:: TODODASK + + .. seealso:: `square` + + :Parameters: + + dtype: data-type, optional + Overrides the data type of the output arrays. A + matching precision of the calculation should be + chosen. For example, a *dtype* of ``'int32'`` is not + allowed, even if the input values are perfect squares. + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The element-wise positive square root of the data, or + `None` if the operation was in-place. + + **Examples** + + >>> d = cf.Data([[0, 1, 2, 3, 4]], 'K2', mask=[[0, 0, 0, 1, 0]]) + >>> print(d.array) + [[0 1 2 -- 4]] + >>> e = d.sqrt() + >>> e + <CF Data(1, 5): [[0.0, ..., 2.0]] K> + >>> print(e.array) + [[0.0 1.0 1.4142135623730951 -- 2.0]] + + Negative input values raise a warning but nonetheless result in NaN + or, if there are already missing values, missing data: + + >>> import warnings + >>> d = cf.Data([0, 1, -4]) + >>> print(d.array) + [ 0 1 -4] + >>> with warnings.catch_warnings(): + ... warnings.simplefilter("ignore") + ... print(d.sqrt().array) + ... + [ 0. 1. nan] + + >>> d = cf.Data([0, 1, -4], mask=[1, 0, 0]) + >>> print(d.array) + [-- 1 -4] + >>> with warnings.catch_warnings(): + ... warnings.simplefilter("ignore") + ... print(d.sqrt().array) + ... 
+ [-- 1.0 --] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d.to_dask_array() + dx = da.sqrt(dx, dtype=dtype) + d._set_dask(dx, reset_mask_hardness=False) + + units = d.Units + if units: + try: + d.override_units(units ** 0.5, inplace=True) + except ValueError as e: + raise type(e)( + f"Incompatible units for taking a square root: {units!r}" + ) + + return d + # ---------------------------------------------------------------- - # Alias + # Aliases # ---------------------------------------------------------------- @property def dtarray(self): """Alias for `datetime_array`""" return self.datetime_array - def max( + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def maximum( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Alias for `maximum`""" - return self.maximum( + """Alias for `max`""" + return self.max( axes=axes, squeeze=squeeze, mtol=mtol, + split_every=split_every, inplace=inplace, i=i, - _preserve_partitions=_preserve_partitions, ) - def min( + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def minimum( self, axes=None, squeeze=False, mtol=1, + split_every=None, + inplace=False, + i=False, + ): + """Alias for `min`""" + return self.min( + axes=axes, + squeeze=squeeze, + mtol=mtol, + split_every=split_every, + inplace=inplace, + i=i, + ) + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def sd( + self, + axes=None, + squeeze=False, + mtol=1, + weights=None, + ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Alias for `minimum`""" - return self.minimum( + """Alias for `std`""" + return self.std( axes=axes, squeeze=squeeze, + weights=weights, mtol=mtol, + ddof=ddof, + split_every=split_every, inplace=inplace, i=i, - _preserve_partitions=_preserve_partitions, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def standard_deviation( self, axes=None, @@ -13541,21 +13137,25 @@ def standard_deviation( mtol=1, weights=None, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Alias for `sd`""" - return self.sd( + """Alias for `std`""" + return self.std( axes=axes, squeeze=squeeze, weights=weights, mtol=mtol, ddof=ddof, + split_every=split_every, inplace=inplace, - _preserve_partitions=_preserve_partitions, + i=i, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def variance( self, axes=None, @@ -13563,9 +13163,9 @@ def variance( weights=None, mtol=1, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): """Alias for `var`""" return self.var( @@ -13574,14 +13174,12 @@ def variance( weights=weights, mtol=mtol, ddof=ddof, + split_every=split_every, inplace=inplace, - _preserve_partitions=_preserve_partitions, + i=i, ) -# --- End: class - - def _size_of_index(index, size=None): """Return the number of elements resulting in applying an index to a sequence. @@ -13620,109 +13218,6 @@ def _size_of_index(index, size=None): return len(index) -def _overlapping_partitions(partitions, indices, axes, master_flip): - """Return the nested list of (modified) partitions which overlap the - given indices to the master array. 
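# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): the units bookkeeping
# behind the `var`/`std` pair above, using cfunits-style Units
# arithmetic (`var` squares the units, `std` takes the square root back).
from cfunits import Units

u = Units("K")
print(u**2)           # K2
print((u**2) ** 0.5)  # K
# ---------------------------------------------------------------------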
-
-    :Parameters:
-
-        partitions : cf.PartitionMatrix
-
-        indices : tuple
-
-        axes : sequence of str
-
-        master_flip : list
-
-    :Returns:
-
-        numpy array
-            A numpy array of cf.Partition objects.
-
-    **Examples:**
-
-    >>> type(f.Data)
-
-    >>> d._axes
-    ['dim1', 'dim2', 'dim0']
-    >>> axis_to_position = {'dim0': 2, 'dim1': 0, 'dim2' : 1}
-    >>> indices = (slice(None), slice(5, 1, -2), [1,3,4,8])
-    >>> x = _overlapping_partitions(d.partitions, indices, axis_to_position, master_flip)
-
-    """
-
-    axis_to_position = {}
-    for i, axis in enumerate(axes):
-        axis_to_position[axis] = i
-
-    if partitions.size == 1:
-        partition = partitions.matrix.item()
-
-        # Find out if this partition overlaps the original slice
-        p_indices, shape = partition.overlaps(indices)
-
-        if p_indices is None:
-            # This partition is not in the slice out of bounds
-            raise
-            # error?
-            return
-
-        # Still here? Create a new partition
-        partition = partition.copy()
-        partition.new_part(p_indices, axis_to_position, master_flip)
-        partition.shape = shape
-
-        new_partition_matrix = np.empty(partitions.shape, dtype=object)
-        new_partition_matrix[...] = partition
-
-        return new_partition_matrix
-    # --- End: if
-
-    # Still here? Then there are 2 or more partitions.
-
-    partitions_list = []
-    partitions_list_append = partitions_list.append
-
-    flat_pm_indices = []
-    flat_pm_indices_append = flat_pm_indices.append
-
-    partitions_flat = partitions.matrix.flat
-
-    i = partitions_flat.index
-
-    for partition in partitions_flat:
-        # Find out if this partition overlaps the original slice
-        p_indices, shape = partition.overlaps(indices)
-
-        if p_indices is None:
-            # This partition is not in the slice
-            i = partitions_flat.index
-            continue
-
-        # Still here? Then this partition overlaps the slice, so
-        # create a new partition.
-        partition = partition.copy()
-        partition.new_part(p_indices, axis_to_position, master_flip)
-        partition.shape = shape
-
-        partitions_list_append(partition)
-
-        flat_pm_indices_append(i)
-
-        i = partitions_flat.index
-    # --- End: for
-
-    new_shape = [
-        len(set(s))
-        for s in np.unravel_index(flat_pm_indices, partitions.shape)
-    ]
-
-    new_partition_matrix = np.empty((len(flat_pm_indices),), dtype=object)
-    new_partition_matrix[...] = partitions_list
-    new_partition_matrix.resize(new_shape)
-
-    return new_partition_matrix
-
-
 def _broadcast(a, shape):
     """Broadcast an array to a given shape.

@@ -13805,3 +13300,252 @@ def _where_broadcastable(data, x, name):
         )

     return True
+
+
+def _collapse(
+    func,
+    d,
+    axis=None,
+    weights=None,
+    keepdims=True,
+    mtol=1,
+    ddof=None,
+    split_every=None,
+):
+    """Collapse data in-place using a given function.
+
+    .. versionadded:: TODODASK
+
+    .. seealso:: `_parse_weights`
+
+    :Parameters:
+
+        func: callable
+            The function that collapses the underlying `dask` array of
+            *d*. Must have the minimum signature (parameters and
+            default values) ``func(dx, axis=None, keepdims=False,
+            mtol=None, split_every=None)`` (optionally including
+            ``weights=None`` or ``ddof=None``), where ``dx`` is the
+            dask array contained in *d*.
+
+        d: `Data`
+            The data to be collapsed.
+
+        axis: (sequence of) int, optional
+            The axes to be collapsed. By default all axes are
+            collapsed, resulting in output with size 1. Each axis is
+            identified by its integer position. If *axis* is an empty
+            sequence then the collapse is applied to each scalar
+            element and the result has the same shape as the input
+            data.
+
+        weights: data_like, `dict`, or `None`, optional
+            Weights associated with values of the data. By default
+            *weights* is `None`, meaning that all non-missing elements
+            of the data have a weight of 1 and all missing elements
+            have a weight of 0.
+
+            If *weights* is a data_like object then it must be
+            broadcastable to the array.
+
+            If *weights* is a dictionary then each key specifies axes
+            of the data (an `int` or `tuple` of `int`), with a
+            corresponding value of data_like weights for those
+            axes. The dimensions of a weights value must correspond to
+            its key axes in the same order. Not all of the axes need
+            weights assigned to them. The weights that will be used
+            will be the outer product of the dictionary's values.
+
+            However they are specified, the weights are internally
+            broadcast to the shape of the data, and those weights that
+            are missing data, or that correspond to the missing
+            elements of the data, are assigned a weight of 0.
+
+            For collapse functions that do not have a ``weights``
+            parameter, *weights* must be `None`.
+
+        keepdims: `bool`, optional
+            By default, the axes which are collapsed are left in the
+            result as dimensions with size one, so that the result
+            will broadcast correctly against the input array. If set
+            to False then collapsed axes are removed from the data.
+
+        mtol: number, optional
+            The sample size threshold below which collapsed values are
+            set to missing data. It is defined as a fraction (between
+            0 and 1 inclusive) of the contributing input data values.
+
+            The default of *mtol* is 1, meaning that a missing datum
+            in the output array occurs whenever all of its
+            contributing input array elements are missing data.
+
+            For other values, a missing datum in the output array
+            occurs whenever more than ``100*mtol%`` of its
+            contributing input array elements are missing data.
+
+        ddof: number, optional
+            The delta degrees of freedom. The number of degrees of
+            freedom used in the calculation is (N-*ddof*) where N
+            represents the number of non-missing elements.
+
+            For collapse functions that do not have a ``ddof``
+            parameter, *ddof* must be `None`.
+
+        split_every: `int` or `dict`, optional
+            Determines the depth of the recursive aggregation. See
+            `dask.array.reduction` for details.
+
+    :Returns:
+
+        (`Data`, formatted weights)
+            The collapsed data and the output of ``_parse_weights(d,
+            weights, axis)``.
+
+    """
+    kwargs = {
+        "axis": axis,
+        "keepdims": keepdims,
+        "split_every": split_every,
+        "mtol": mtol,
+    }
+
+    weights = _parse_weights(d, weights, axis)
+    if weights is not None:
+        kwargs["weights"] = weights
+
+    if ddof is not None:
+        kwargs["ddof"] = ddof
+
+    dx = d.to_dask_array()
+    dx = func(dx, **kwargs)
+    d._set_dask(dx, reset_mask_hardness=True)
+
+    return d, weights
+
+
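A quick sketch of the *mtol* rule documented above, mirroring `test_Data_collapse_mtol` later in this diff (exactly half of the input elements are masked):

    import numpy as np
    import cf

    d = cf.Data(np.arange(6), "K", mask=[0, 1, 0, 1, 0, 1], chunks=2)
    bool(d.sum(mtol=0.4).array.mask)  # True: 50% missing is more than 40%
    bool(d.sum(mtol=0.5).array.mask)  # False: 50% missing is not more than 50%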
+def _parse_weights(d, weights, axis=None):
+    """Parse the weights input to `_collapse`.
+
+    .. versionadded:: TODODASK
+
+    .. seealso:: `_collapse`
+
+    :Parameters:
+
+        d: `Data`
+            The data to be collapsed.
+
+        weights: data_like or `dict`
+            See `_collapse` for details.
+
+        axis: (sequence of) `int`, optional
+            See `_collapse` for details.
+
+    :Returns:
+
+        `Data` or `None`
+            * If *weights* is a data_like object then it is returned
+              unchanged as a `Data` object. It is up to the
+              downstream functions to check if the weights can be
+              broadcast to the data.
+
+            * If *weights* is a dictionary then the outer product of
+              the dictionary's values (i.e. of the weights
+              components) is returned as a `Data` object that is
+              broadcastable to the data.
+
+              If the dictionary is empty, or none of the axes defined
+              by the keys correspond to collapse axes defined by
+              *axis*, then the collapse is unweighted and `None` is
+              returned.
+
+            Note that, in all cases, the returned weights are *not*
+            modified to account for missing values in the data.
+
+    **Examples**
+
+    >>> d = cf.Data(np.arange(12)).reshape(4, 3)
+
+    >>> _parse_weights(d, [1, 2, 1], (0, 1))
+    <CF Data(3): [1, 2, 1]>
+
+    >>> _parse_weights(d, [[1, 2, 1]], (0, 1))
+    <CF Data(1, 3): [[1, 2, 1]]>
+
+    >>> _parse_weights(d, {1: [1, 2, 1]}, (0, 1))
+    <CF Data(1, 3): [[1, 2, 1]]>
+
+    >>> print(_parse_weights(d, {0: [1, 2, 3, 4], 1: [1, 2, 1]}, (0, 1)))
+    [[1 2 1]
+     [2 4 2]
+     [3 6 3]
+     [4 8 4]]
+
+    >>> print(cf.data.data._parse_weights(d, {}, (0, 1)))
+    None
+
+    >>> print(cf.data.data._parse_weights(d, {1: [1, 2, 1]}, 0))
+    None
+
+    """
+    if weights is None:
+        # No weights
+        return
+
+    if not isinstance(weights, dict):
+        # Weights is data_like. Don't check broadcastability to d,
+        # leave that to whatever uses the weights.
+        return Data.asdata(weights)
+
+    if not weights:
+        # No weights (empty dictionary)
+        return
+
+    if axis is None:
+        axis = tuple(range(d.ndim))
+    else:
+        axis = d._parse_axes(axis)
+
+    weights = weights.copy()
+    weights_axes = set()
+    for key, value in tuple(weights.items()):
+        del weights[key]
+        key = d._parse_axes(key)
+        if weights_axes.intersection(key):
+            raise ValueError("Duplicate weights axis")
+
+        weights[tuple(key)] = value
+        weights_axes.update(key)
+
+    if not weights_axes.intersection(axis):
+        # No weights span collapse axes
+        return
+
+    # For each component, add missing dimensions as size 1.
+    w = []
+    shape = d.shape
+    for key, value in weights.items():
+        value = Data.asdata(value)
+
+        # Make sure axes are in ascending order, permuting the
+        # value's dimensions to follow the sorted order of its key
+        # axes
+        skey = tuple(sorted(key))
+        if key != skey:
+            value = value.transpose([key.index(i) for i in skey])
+            key = skey
+
+        if not all(
+            i in (j, 1)
+            for i, j in zip(value.shape, [shape[i] for i in key])
+        ):
+            raise ValueError(
+                f"Weights component for axes {tuple(key)} with shape "
+                f"{value.shape} is not broadcastable to data with "
+                f"shape {shape}"
+            )
+
+        new_shape = [n if i in key else 1 for i, n in enumerate(shape)]
+        w.append(value.reshape(new_shape))
+
+    # Return the product of the weights components, which will be
+    # broadcastable to d
+    return reduce(mul, w)
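For orientation, a hypothetical, stripped-down sketch of how a public `Data` collapse method is expected to delegate to `_collapse` (the real methods, such as `var` above, additionally apply decorators, in-place handling and unit overrides):

    def mean(self, axes=None, weights=None, squeeze=False, mtol=1, split_every=None):
        # Collapse a copy of the data with the dask-level reduction
        d = self.copy()
        d, _ = _collapse(
            Collapse.mean,
            d,
            axis=axes,
            weights=weights,
            keepdims=not squeeze,
            mtol=mtol,
            split_every=split_every,
        )
        return d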
diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py
index 34c910c62c..bcb1921b2e 100644
--- a/cf/data/mixin/deprecations.py
+++ b/cf/data/mixin/deprecations.py
@@ -69,6 +69,31 @@ def __hash__(self):
         removed_at="5.0.0",
     )

+    @property
+    def _HDF_chunks(self):
+        """The HDF chunksizes.
+
+        Deprecated at version TODODASK.
+
+        DO NOT CHANGE IN PLACE.
+
+        """
+        _DEPRECATION_ERROR_ATTRIBUTE(
+            self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0"
+        )  # pragma: no cover
+
+    @_HDF_chunks.setter
+    def _HDF_chunks(self, value):
+        _DEPRECATION_ERROR_ATTRIBUTE(
+            self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0"
+        )  # pragma: no cover
+
+    @_HDF_chunks.deleter
+    def _HDF_chunks(self):
+        _DEPRECATION_ERROR_ATTRIBUTE(
+            self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0"
+        )  # pragma: no cover
+
     @property
     def Data(self):
         """Deprecated at version 3.0.0, use attribute `data` instead."""
diff --git a/cf/data/utils.py b/cf/data/utils.py
index d274b5230b..ef6e5f64ec 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -668,6 +668,8 @@ def YMDhms(d, attr):
         raise ValueError(f"Can't get {attr}s from data with {units!r}")

     d = d._asdatetime()
-    d._map_blocks(partial(cf_YMDhms, attr=attr), dtype=int)
+    dx = d.to_dask_array()
+    dx = dx.map_blocks(partial(cf_YMDhms, attr=attr), dtype=int)
+    d._set_dask(dx, reset_mask_hardness=False)
     d.override_units(Units(None), inplace=True)

     return d
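The `to_dask_array`/`map_blocks`/`_set_dask` sequence above is the general replacement for the removed `_map_blocks` helper. A hedged sketch of the same pattern applied to an arbitrary element-wise operation (`halve` is illustrative only, not part of this diff):

    def halve(d):
        # Apply a function chunk-by-chunk to the data's dask array
        dx = d.to_dask_array()
        dx = dx.map_blocks(lambda block: block / 2, dtype=float)
        d._set_dask(dx, reset_mask_hardness=False)
        return d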
diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py
index 188d454dbd..26537d7cec 100644
--- a/cf/docstring/docstring.py
+++ b/cf/docstring/docstring.py
@@ -28,14 +28,8 @@
     # ----------------------------------------------------------------
     # Class description substitutions (1 level of indentation)
     # ----------------------------------------------------------------
-    "{{formula terms links}}": """See the parametric vertical coordinate sections of the CF
-    conventions for more details:
-
-    `4.3.3. Parametric Vertical Coordinate
-    `_
-
-    `Appendix D: Parametric Vertical Coordinates
-    `_""",
+    "{{formula terms links}}": """See CF section 4.3.3 "Parametric Vertical Coordinate" and CF
+    Appendix D "Parametric Vertical Coordinates" for details.""",
     # ----------------------------------------------------------------
     # Class description substitutions (1 level of indentation)
     # ----------------------------------------------------------------
@@ -245,6 +239,108 @@
             domain axis. If the vertical axis does not appear in the
             computed non-parametric coordinates then this is an empty
             tuple.""",
+    # collapse axes
+    "{{collapse axes: (sequence of) `int`, optional}}": """axes: (sequence of) `int`, optional
+            The axes to be collapsed. By default all axes are
+            collapsed, resulting in output with size 1. Each axis
+            is identified by its integer position. If *axes* is an
+            empty sequence then the collapse is applied to each
+            scalar element and the result has the same shape as
+            the input data.""",
+    # collapse squeeze
+    "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional
+            By default, the axes which are collapsed are left in
+            the result as dimensions with size one, so that the
+            result will broadcast correctly against the input
+            array. If set to True then collapsed axes are removed
+            from the data.""",
+    # collapse keepdims
+    "{{collapse keepdims: `bool`, optional}}": """keepdims: `bool`, optional
+            By default, the axes which are collapsed are left in
+            the result as dimensions with size one, so that the
+            result will broadcast correctly against the input
+            array. If set to False then collapsed axes are removed
+            from the data.""",
+    # weights
+    "{{weights: data_like, `dict`, or `None`, optional}}": """weights: data_like, `dict`, or `None`, optional
+            Weights associated with values of the data. By default
+            *weights* is `None`, meaning that all non-missing
+            elements of the data have a weight of 1 and all
+            missing elements have a weight of 0.
+ + If *weights* is a data_like object then it must be + broadcastable to the array. + + If *weights* is a dictionary then each key specifies + axes of the data (an `int` or `tuple` of `int`), with + a corresponding value of data_like weights for those + axes. The dimensions of a weights value must + correspond to its key axes in the same order. Not all + of the axes need weights assigned to them. The weights + that will be used will be an outer product of the + dictionary's values. + + However they are specified, the weights are internally + broadcast to the shape of the data, and those weights + that are missing data, or that correspond to the + missing elements of the data, are assigned a weight of + 0.""", + # collapse mtol + "{{mtol: number, optional}}": """mtol: number, optional + The sample size threshold below which collapsed values + are set to missing data. It is defined as a fraction + (between 0 and 1 inclusive) of the contributing input + data values. + + The default of *mtol* is 1, meaning that a missing + datum in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the + input data.""", + # ddof + "{{ddof: number}}": """ddof: number + The delta degrees of freedom, a non-negative + number. The number of degrees of freedom used in the + calculation is (N-*ddof*) where N represents the + number of non-missing elements. A value of 1 applies + Bessel's correction. If the calculation is weighted + then *ddof* can only be 0 or 1.""", + # split_every + "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional + Determines the depth of the recursive aggregation. If + set to or more than the number of input chunks, the + aggregation will be performed in two steps, one + partial collapse per input chunk and a single + aggregation at the end. If set to less than that, an + intermediate aggregation step will be used, so that + any of the intermediate or final aggregation steps + operates on no more than ``split_every`` inputs. The + depth of the aggregation graph will be + :math:`log_{split_every}(input chunks along reduced + axes)`. Setting to a low value can reduce cache size + and network transfers, at the cost of more CPU and a + larger dask graph. + + By default, `dask` heuristically decides on a good + value. A default can also be set globally with the + ``split_every`` key in `dask.config`. See + `dask.array.reduction` for details.""", + # Collapse weights + "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional + Weights associated with values of the array. By + default *weights* is `None`, meaning that all + non-missing elements of the array are assumed to have + a weight equal to one. 
+ + When *weights* is a data_like object then it must have + the same shape as the array.""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cf/formula_terms.py b/cf/formula_terms.py index 8ecb993ecf..bf1bdd1214 100644 --- a/cf/formula_terms.py +++ b/cf/formula_terms.py @@ -1,6 +1,6 @@ import logging -import cfdm +from cfdm.core import DocstringRewriteMeta from .constants import ( formula_terms_computed_standard_names, @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -class FormulaTerms(metaclass=cfdm.core.DocstringRewriteMeta): +class FormulaTerms(metaclass=DocstringRewriteMeta): """Functions for computing non-parametric vertical coordinates from the formula defined by a coordinate reference construct. @@ -64,13 +64,13 @@ def __docstring_substitutions__(self): return _docstring_substitution_definitions def __docstring_package_depth__(self): - """Return the package depth for {{package}} docstring + """Return the package depth for "package" docstring substitutions. See `_docstring_package_depth` for details. """ - return 1 + return 0 # ---------------------------------------------------------------- # Private methods @@ -225,8 +225,6 @@ def _computed_standard_name(f, standard_name, coordinate_reference): """Find the standard name of the computed non-parametric vertical coordinates. - {{formula terms links}} - .. versionadded:: 3.8.0 :Parameters: @@ -558,8 +556,6 @@ def _check_standard_name_consistency( s-coordinate, the ocean_sigma over z coordinate, and the ocean double sigma coordinate. - {{formula terms links}} - .. versionadded:: 3.8.0 :Parameters: @@ -680,8 +676,6 @@ def atmosphere_ln_pressure_coordinate( """Compute non-parametric vertical coordinates from atmosphere_ln_pressure_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -761,8 +755,6 @@ def atmosphere_sigma_coordinate( """Compute non-parametric vertical coordinates from atmosphere_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -858,8 +850,6 @@ def atmosphere_hybrid_sigma_pressure_coordinate( atmosphere_hybrid_sigma_pressure_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -989,8 +979,6 @@ def atmosphere_hybrid_height_coordinate( """Compute non-parametric vertical coordinates from atmosphere_hybrid_height_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1087,8 +1075,6 @@ def atmosphere_sleve_coordinate( """Compute non-parametric vertical coordinates from atmosphere_sleve_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1242,8 +1228,6 @@ def ocean_sigma_coordinate( """Compute non-parametric vertical coordinates from ocean_sigma_coordinate parametric coordinates. 
- {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1347,8 +1331,6 @@ def ocean_s_coordinate( """Compute non-parametric vertical coordinates from ocean_s_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1489,8 +1471,6 @@ def ocean_s_coordinate_g1( """Compute non-parametric vertical coordinates from ocean_s_coordinate_g1 parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1618,8 +1598,6 @@ def ocean_s_coordinate_g2( """Compute non-parametric vertical coordinates from ocean_s_coordinate_g2 parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1747,8 +1725,6 @@ def ocean_sigma_z_coordinate( """Compute non-parametric vertical coordinates from ocean_sigma_z_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1897,8 +1873,6 @@ def ocean_double_sigma_coordinate( """Compute non-parametric vertical coordinates from ocean_double_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -2051,8 +2025,6 @@ def formula( ): """Compute non-parametric vertical coordinates. - {{formula terms links}} - Dimensional vertical auxiliary coordinate values are computed from parametric vertical coordinate values (usually dimensionless) and associated domain ancillary constructs, as defined by the formula diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 8dc3f97f6b..2aed0e1ccd 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -43,6 +43,7 @@ ma[0, 3, :, 3] = np.ma.masked ma[1, 2, 3, :] = np.ma.masked +mw = np.ma.array(w, mask=ma.mask) # If True, all tests that will not pass temporarily due to the LAMA-to-Dask # migration will be skipped. 
These skips will be incrementally removed as the @@ -60,14 +61,23 @@ def reshape_array(a, axes): return b -class DataTest(unittest.TestCase): - - axes_combinations = [ +def axis_combinations(a): + return [ axes for n in range(1, a.ndim + 1) for axes in itertools.combinations(range(a.ndim), n) ] + +class DataTest(unittest.TestCase): + + axes_combinations = axis_combinations(a) + # [ + # axes + # for n in range(1, a.ndim + 1) + # for axes in itertools.combinations(range(a.ndim), n) + # ] + filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_file.nc" ) @@ -81,6 +91,7 @@ class DataTest(unittest.TestCase): a = a w = w ma = ma + mw = mw ones = ones test_only = [] @@ -706,16 +717,25 @@ def test_Data_compressed(self): d = cf.Data(self.ma, "km") self.assertTrue((self.ma.compressed() == d.compressed()).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __eq__") def test_Data_stats(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - d = cf.Data([[0, 1, 2], [3, -99, 5]], mask=[[0, 0, 0], [0, 1, 0]]) - - self.assertIsInstance(d.stats(), dict) - _ = d.stats(all=True) - _ = d.stats(mean_of_upper_decile=True, range=False) + d = cf.Data([1, 1]) + + self.assertEqual( + d.stats(sum=True, weights=1), + { + "minimum": 1, + "mean": 1.0, + "median": 1.0, + "maximum": 1, + "range": 0, + "mid_range": 1.0, + "standard_deviation": 0.0, + "root_mean_square": 1.0, + "sum": 2, + "sample_size": 2, + }, + ) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data__init__dtype_mask(self): @@ -1818,58 +1838,22 @@ def test_Data_flip(self): self.assertEqual(d[0].shape, (1, 4, 5)) self.assertEqual(d[-1].shape, (1, 4, 5)) - self.assertEqual(d[0].maximum(), 4 * 5) - self.assertEqual(d[-1].maximum(), 3 * 4 * 5) + self.assertEqual(d[0].max().array, 4 * 5) + self.assertEqual(d[-1].max().array, 3 * 4 * 5) for i in (2, 1): e = d.flip(i) self.assertEqual(e[0].shape, (1, 4, 5)) self.assertEqual(e[-1].shape, (1, 4, 5)) - self.assertEqual(e[0].maximum(), 4 * 5) - self.assertEqual(e[-1].maximum(), 3 * 4 * 5) + self.assertEqual(e[0].max().array, 4 * 5) + self.assertEqual(e[-1].max().array, 3 * 4 * 5) i = 0 e = d.flip(i) self.assertEqual(e[0].shape, (1, 4, 5)) self.assertEqual(e[-1].shape, (1, 4, 5)) - self.assertEqual(e[0].maximum(), 3 * 4 * 5) - self.assertEqual(e[-1].maximum(), 4 * 5) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'datum'") - def test_Data_max(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (False, True): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(6, "metre") - ) - self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 6) - d[0, 2] = cf.masked - self.assertEqual(d.maximum(_preserve_partitions=pp), 5) - self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 5) - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(0.005, "km") - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_min(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (False, True): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.minimum(_preserve_partitions=pp), cf.Data(1, "metre") - ) - self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 1) - d[1, 0] = cf.masked - self.assertEqual(d.minimum(_preserve_partitions=pp), 2) - 
self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 2) - self.assertEqual( - d.minimum(_preserve_partitions=pp), cf.Data(0.002, "km") - ) + self.assertEqual(e[0].max().array, 3 * 4 * 5) + self.assertEqual(e[-1].max().array, 4 * 5) def test_Data_ndindex(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -2394,154 +2378,6 @@ def test_Data_argmax(self): with self.assertRaises(Exception): d.argmax(axis=d.ndim) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits 'NoneType' is not iterable") - def test_Data__collapse_SHAPE(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - a = np.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) - - for h in ( - "sample_size", - "sum", - "min", - "max", - "mean", - "var", - "sd", - "mid_range", - "range", - "integral", - "maximum_absolute_value", - "minimum_absolute_value", - "sum_of_squares", - "root_mean_square", - "mean_absolute_value", - "median", - "mean_of_upper_decile", - "sum_of_weights", - "sum_of_weights2", - ): - - d = cf.Data(a[(slice(None, None, -1),) * a.ndim].copy()) - d.flip(inplace=True) - _ = cf.Data(self.w.copy()) - - shape = list(d.shape) - - for axes in self.axes_combinations: - e = getattr(d, h)( - axes=axes, squeeze=False, _preserve_partitions=False - ) - - shape = list(d.shape) - for i in axes: - shape[i] = 1 - - shape = tuple(shape) - self.assertEqual( - e.shape, - shape, - "{}, axes={}, not squeezed bad shape: {} != {}".format( - h, axes, e.shape, shape - ), - ) - - for axes in self.axes_combinations: - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=False - ) - shape = list(d.shape) - for i in sorted(axes, reverse=True): - shape.pop(i) - - shape = tuple(shape) - self.assertEqual( - e.shape, - shape, - "{}, axes={}, squeezed bad shape: {} != {}".format( - h, axes, e.shape, shape - ), - ) - - e = getattr(d, h)(squeeze=True, _preserve_partitions=False) - shape = () - self.assertEqual( - e.shape, - shape, - "{}, axes={}, squeezed bad shape: {} != {}".format( - h, None, e.shape, shape - ), - ) - - e = getattr(d, h)(squeeze=False, _preserve_partitions=False) - shape = (1,) * d.ndim - self.assertEqual( - e.shape, - shape, - "{}, axes={}, not squeezed bad shape: {} != {}".format( - h, None, e.shape, shape - ), - ) - # --- End: for - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_max_min_sum_sum_of_squares(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for _np, h in zip( - (np.sum, np.amin, np.amax, np.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for _np, h in zip( - (np.ma.sum, np.ma.amin, np.ma.amax, np.ma.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - b = np.ma.asanyarray(b) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, 
\nb.mask={}".format( - h, axes, e.mask.array, b.mask - ), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - def test_Data_percentile_median(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -2665,632 +2501,6 @@ def test_Data_percentile_median(self): with self.assertRaises(ValueError): d.percentile(q).array - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") - def test_Data_mean_of_upper_decile(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - p = np.percentile(b, 90, axis=-1, keepdims=True) - b = np.ma.where(b < p, np.ma.masked, b) - b = np.average(b, axis=-1) - - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, unweighted, " - "unmasked \ne={}, \nb={}".format(axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = np.ma.filled(b, np.nan) - with np.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, message=".*All-NaN slice encountered" - ) - p = np.nanpercentile(b, 90, axis=-1, keepdims=True) - - b = np.ma.masked_where(np.isnan(b), b, copy=False) - - p = np.where(np.isnan(p), b.max() + 1, p) - - with np.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, - message=".*invalid value encountered in less", - ) - b = np.ma.where(b < p, np.ma.masked, b) - - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) - - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "mean_of_upper_decile, axis={}, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, " - "unweighted, masked " - "\ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_range_mid_range(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - b = np.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format(h, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - 
e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'w' for DataTest") - def test_Data_integral(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unmasked - d = cf.Data(self.a) - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - - e = d.integral( - axes=axes, squeeze=True, weights=x, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unmasked \ne={}, \nb={}".format( - axes, e.array, b - ), - ) - - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - b = np.ma.asanyarray(b) - - e = d.integral( - axes=axes, squeeze=True, weights=x, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={} masked, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, masked \ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_sum_of_weights_sum_of_weights2(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("sum_of_weights", "sum_of_weights2"): - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for a, h in zip( - (self.mones, self.mones), ("sum_of_weights", "sum_of_weights2") - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = np.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne.mask={}, \nb.mask={}".format( - h, axes, pp, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma) - x = cf.Data(self.w) - for a, h in zip( - (self.mw, self.mw * self.mw), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - a = a.copy() - a.mask = self.ma.mask - b = reshape_array(a, axes) - b = np.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format(h, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a) - for a, h in zip( - (self.w, self.w * self.w), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - b = 
reshape_array(a, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_mean_mean_absolute_value(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for absolute in (False, True): - a = self.a - ma = self.ma - method = "mean" - if absolute: - a = np.absolute(a) - ma = np.absolute(ma) - method = "mean_absolute_value" - - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = np.mean(b, axis=-1) - e = getattr(d, method)(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(method, axes, e.array, b), - ) - # --- End: for - - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(a, axes) - v = reshape_array(self.w, axes) - b = np.average(b, axis=-1, weights=v) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} weighted, unmasked axis={}, \ne={}, " - "\nb={}".format(method, axes, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(ma, axes) - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{} unweighted, masked axis={}, \ne.mask={}, " - "\nb.mask={}".format(method, axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} unweighted, masked axis={}, \ne={}, " - "\nb={}, ".format(method, axes, e.array, b), - ) - # --- End: for - - # weighted, masked - for axes in self.axes_combinations: - b = reshape_array(ma, axes) - v = reshape_array(self.mw, axes) - b = np.ma.average(b, axis=-1, weights=v) - b = np.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{} weighted, masked axis={}, \ne.mask={}, " - "\nb.mask={}".format(method, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} weighted, masked axis={}, \ne={}, " - "\nb={}, ".format(method, axes, e.array, b), - ) - # --- End: for - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_root_mean_square(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - b = np.mean(b, axis=-1) ** 0.5 - e = d.root_mean_square(axes=axes, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for - - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - v = reshape_array(self.w, axes) - b = np.average(b, axis=-1, weights=v) ** 0.5 - - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # 
--- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) ** 0.5 - - e = d.root_mean_square(axes=axes, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, unweighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, masked \ne={}, " - "\nb={}, ".format(axes, e.array, b), - ) - # --- End: for - - # weighted, masked - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - v = reshape_array(self.mw, axes) - b = np.ma.average(b, axis=-1, weights=v) - b = np.ma.asanyarray(b) ** 0.5 - - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, weighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, masked \ne={}, \nb={}, ".format( - axes, e.array, b - ), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_sample_size(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - # --- End: for - - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.mones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'axes_combinations'") - def test_Data_sd_var(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - ddofs = (0, 1) - - for pp in (False, True): - # unweighted, unmasked - d = cf.Data(self.a, units="K") - for _np, h in zip((np.var, np.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - b = _np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma, units="K") - for _np, h in zip((np.ma.var, np.ma.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = _np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): - for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - - avg = np.average(b, axis=-1, weights=v) - if np.ndim(avg) < b.ndim: - avg = np.expand_dims(avg, -1) - - b, V1 = np.average( - (b - avg) ** 2, axis=-1, weights=v, returned=True - ) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. - V2 = np.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - b = np.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, unmasked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): - for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.ma, axes) - v = reshape_array(self.mw, axes) - - not_enough_data = np.ma.count(b, axis=-1) <= ddof - - avg = np.ma.average(b, axis=-1, weights=v) - if np.ndim(avg) < b.ndim: - avg = np.expand_dims(avg, -1) - - b, V1 = np.ma.average( - (b - avg) ** 2, axis=-1, weights=v, returned=True - ) - - b = np.ma.where(not_enough_data, np.ma.masked, b) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. 
- V2 = np.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - if h == "sd": - self.assertEqual(e.Units, d.Units) - else: - self.assertEqual(e.Units, d.Units ** 2) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}, ".format( - h, axes, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, masked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'select'") def test_Data_dumpd_loadd_dumps(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -3919,6 +3129,690 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) + def test_Data_reshape(self): + a = self.ma + d = cf.Data(a) + + self.assertIsNone(d.reshape(*d.shape, inplace=True)) + self.assertEqual(d.shape, a.shape) + + for original_shape, new_shape, chunks in ( + ((10,), (10,), (3, 3, 4)), + ((10,), (10, 1, 1), 5), + ((10,), (1, 10), 5), + ((24,), (2, 3, 4), 12), + ((1, 24), (2, 3, 4), 12), + ((2, 3, 4), (24,), (1, 3, 4)), + ((2, 3, 4), (24,), 4), + ((2, 3, 4), (24, 1), 4), + ((2, 3, 4), (1, 24), 4), + ((4, 4, 1), (4, 4), 2), + ((4, 4), (4, 4, 1), 2), + ((1, 4, 4), (4, 4), 2), + ((1, 4, 4), (4, 4, 1), 2), + ((1, 4, 4), (1, 1, 4, 4), 2), + ((4, 4), (1, 4, 4, 1), 2), + ((4, 4), (1, 4, 4), 2), + ((2, 3), (2, 3), (1, 2)), + ((2, 3), (3, 2), 3), + ((4, 2, 3), (4, 6), 4), + ((3, 4, 5, 6), (3, 4, 5, 6), (2, 3, 4, 5)), + ((), (1,), 1), + ((1,), (), 1), + ((24,), (3, 8), 24), + ((24,), (4, 6), 6), + ((24,), (4, 3, 2), 6), + ((24,), (4, 6, 1), 6), + ((24,), (4, 6), (6, 12, 6)), + ((64, 4), (8, 8, 4), (16, 2)), + ((4, 64), (4, 8, 4, 2), (2, 16)), + ((4, 8, 4, 2), (2, 1, 2, 32, 2), (2, 4, 2, 2)), + ((4, 1, 4), (4, 4), (2, 1, 2)), + ((0, 10), (0, 5, 2), (5, 5)), + ((5, 0, 2), (0, 10), (5, 2, 2)), + ((0,), (2, 0, 2), (4,)), + ((2, 0, 2), (0,), (4, 4, 4)), + ((2, 3, 4), -1, -1), + ): + a = np.random.randint(10, size=original_shape) + d = cf.Data(a, chunks=chunks) + + a = a.reshape(new_shape) + d = d.reshape(new_shape) + + self.assertEqual(d.shape, a.shape) + self.assertTrue((d.array == a).all()) + + def test_Data_square(self): + a = self.ma.astype(float) + asquare = np.square(a) + + d = cf.Data(a) + self.assertIsNone(d.square(inplace=True)) + self.assertTrue((d.array == asquare).all()) + self.assertEqual(d.Units, cf.Units()) + + d = cf.Data(a, "m") + e = d.square() + self.assertEqual(e.dtype, asquare.dtype) + self.assertTrue((e.array == asquare).all()) + self.assertEqual(e.Units, cf.Units("m2")) + + asquare = np.square(a, dtype="float32") + e = d.square(dtype="float32") + self.assertEqual(e.dtype, asquare.dtype) + self.assertTrue((e.array == asquare).all()) + + def test_Data_sqrt(self): + a = self.ma.astype(float) + asqrt = np.sqrt(a) + + d = cf.Data(a) + self.assertIsNone(d.sqrt(inplace=True)) + self.assertTrue((d.array == asqrt).all()) + self.assertEqual(d.Units, cf.Units()) + + d = cf.Data(a, "m2") + e = d.sqrt() + self.assertEqual(e.dtype, asqrt.dtype) + self.assertTrue((e.array == asqrt).all()) + self.assertEqual(e.Units, cf.Units("m")) + + asqrt = np.sqrt(a, dtype="float32") + e = d.sqrt(dtype="float32") + self.assertEqual(e.dtype, asqrt.dtype) + self.assertTrue((e.array == 
asqrt).all()) + + # Incompatible units + d = cf.Data(a, "m") + with self.assertRaises(ValueError): + d.sqrt() + + def test_Data_integral(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.integral(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_max(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.max(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_maximum_absolute_value(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(abs(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.maximum_absolute_value(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mean(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(b, axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mean_absolute_value(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(abs(b), axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean_absolute_value(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mid_range(self): + # Masked array, non-masked weights + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = (np.max(b, axis=-1) + np.min(b, axis=-1)) / 2.0 + b = np.ma.asanyarray(b) + + e = d.mid_range(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + with self.assertRaises(TypeError): + cf.Data([0, 1], dtype=bool).mid_range() + + def test_Data_min(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.min(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.min(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_minimum_absolute_value(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.min(abs(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.minimum_absolute_value(axes=axis, squeeze=True) + e 
= np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_range(self): + # Masked array + a = self.ma + + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(b, axis=-1) - np.min(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.range(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + with self.assertRaises(TypeError): + cf.Data([0, 1], dtype=bool).range() + + def test_Data_root_mean_square(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(b * b, axis=-1, weights=w) ** 0.5 + b = np.ma.asanyarray(b) + + e = d.root_mean_square(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sample_size(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.sample_size(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Non-masked array + a = self.a + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.asanyarray(b) + + e = d.sample_size(axes=axis, squeeze=True) + e = np.array(e.array) + + self.assertTrue(np.allclose(e, b)) + + def test_Data_std(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + std = d.std(weights=weights, ddof=1) + var = d.var(weights=weights, ddof=1) + + self.assertTrue(std.equals(var.sqrt())) + + def test_Data_sum(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_squares(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_squares(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_weights(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weights=None + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + for axis in axis_combinations(a): + b = 
reshape_array(a, axis) + w = reshape_array(weights, axis) + w = np.ma.masked_where(b.mask, w) + b = np.sum(w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_weights2(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weights=None + for axis in axis_combinations(a): + e = d.sum_of_weights2(axes=axis) + f = d.sum_of_weights(axes=axis) + self.assertTrue(e.equals(f)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + w = np.ma.masked_where(b.mask, w) + b = np.sum(w * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights2(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_var(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weighted ddof = 0 + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) + mu = mu.reshape(mu.shape + (1,)) + w = np.ma.masked_where(b.mask, w) + + b = np.sum(w * (b - mu) ** 2, axis=-1) + b = b / V1 + b = np.ma.asanyarray(b) + + e = d.var(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b), f"e={e}\nb={b}\ne-b={e-b}") + + # Weighted ddof = 1 + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) + mu = mu.reshape(mu.shape + (1,)) + w = np.ma.masked_where(b.mask, w) + V2 = np.sum(w * w, axis=-1) + + b = np.sum(w * (b - mu) ** 2, axis=-1) + b = b / (V1 - (V2 / V1)) + b = np.ma.asanyarray(b) + + e = d.var(axes=axis, weights=weights, ddof=1, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Unweighted ddof = 1 + for axis in axis_combinations(a): + b = reshape_array(a, axis) + mu, V1 = np.ma.average(b, axis=-1, returned=True) + mu = mu.reshape(mu.shape + (1,)) + + b = np.sum((b - mu) ** 2, axis=-1) + b = b / (V1 - 1) + b = np.ma.asanyarray(b) + + e = d.var(axes=axis, ddof=1, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __lt__ and __le__") + def test_Data_mean_of_upper_decile(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.filled(b, np.nan) + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, message=".*All-NaN slice encountered" + ) + p = np.nanpercentile(b, 90, axis=-1, keepdims=True) + + b = np.ma.masked_where(np.isnan(b), b, copy=False) + p = np.where(np.isnan(p), b.max() + 1, p) + + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, + message=".*invalid value encountered in less", + ) + b = np.ma.where(b < p, np.ma.masked, b) + + b = np.ma.average(b, axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = 
+
+    def test_Data_collapse_mtol(self):
+        # Data with exactly half of its elements masked
+        d = cf.Data(np.arange(6), "K", mask=[0, 1, 0, 1, 0, 1], chunks=2)
+
+        for func in (
+            d.integral,
+            d.mean,
+            d.mean_absolute_value,
+            d.median,
+            d.min,
+            d.mid_range,
+            d.minimum_absolute_value,
+            d.max,
+            d.maximum_absolute_value,
+            d.range,
+            d.root_mean_square,
+            d.sample_size,
+            d.std,
+            d.sum,
+            d.sum_of_squares,
+            d.sum_of_weights,
+            d.sum_of_weights2,
+            d.var,
+        ):
+            self.assertTrue(func(mtol=0.4).array.mask)
+            self.assertFalse(func(mtol=0.5).array.mask)
+
+        # TODODASK - add in mean_of_upper_decile when it's daskified
+
+    def test_Data_collapse_units(self):
+        d = cf.Data([1, 2], "K")
+
+        self.assertEqual(d.sample_size().Units, cf.Units())
+
+        for func in (
+            d.integral,
+            d.mean,
+            d.mean_absolute_value,
+            d.median,
+            d.min,
+            d.mid_range,
+            d.minimum_absolute_value,
+            d.max,
+            d.maximum_absolute_value,
+            d.range,
+            d.root_mean_square,
+            d.std,
+            d.sum,
+        ):
+            self.assertEqual(func().Units, d.Units)
+
+        for func in (
+            d.sum_of_squares,
+            d.var,
+        ):
+            self.assertEqual(func().Units, d.Units ** 2)
+
+        for func in (
+            d.sum_of_weights,
+            d.sum_of_weights2,
+        ):
+            self.assertEqual(func().Units, cf.Units())
+
+        # Weighted
+        w = cf.Data(1, "m")
+        self.assertEqual(d.integral(weights=w).Units, d.Units * w.Units)
+        self.assertEqual(d.sum_of_weights(weights=w).Units, w.Units)
+        self.assertEqual(d.sum_of_weights2(weights=w).Units, w.Units ** 2)
+
+        # Dimensionless data
+        d = cf.Data([1, 2])
+        self.assertEqual(d.integral(weights=w).Units, w.Units)
+
+        for func in (
+            d.sum_of_squares,
+            d.var,
+        ):
+            self.assertEqual(func().Units, cf.Units())
+
+        # TODODASK - add in mean_of_upper_decile when it's daskified
+
+    def test_Data_collapse_keepdims(self):
+        d = cf.Data(np.arange(6).reshape(2, 3))
+
+        for func in (
+            d.integral,
+            d.mean,
+            d.mean_absolute_value,
+            d.median,
+            d.min,
+            d.mid_range,
+            d.minimum_absolute_value,
+            d.max,
+            d.maximum_absolute_value,
+            d.range,
+            d.root_mean_square,
+            d.sample_size,
+            d.std,
+            d.sum,
+            d.sum_of_squares,
+            d.sum_of_weights,
+            d.sum_of_weights2,
+            d.var,
+        ):
+            for axis in axis_combinations(d):
+                e = func(axes=axis, squeeze=False)
+                s = [1 if i in axis else n for i, n in enumerate(d.shape)]
+                self.assertEqual(e.shape, tuple(s))
+
+            for axis in axis_combinations(d):
+                e = func(axes=axis, squeeze=True)
+                s = [n for i, n in enumerate(d.shape) if i not in axis]
+                self.assertEqual(e.shape, tuple(s))
+
+        # TODODASK - add in mean_of_upper_decile
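The mtol behaviour exercised by `test_Data_collapse_mtol` above is worth
seeing in isolation: with exactly half of the six elements masked, every
collapse result is masked when the missing-data fraction exceeds mtol, and
kept otherwise. A sketch, assuming cf is installed::

    import numpy as np
    import cf

    d = cf.Data(np.arange(6), "K", mask=[0, 1, 0, 1, 0, 1], chunks=2)

    # 3 of the 6 values are missing, a fraction of exactly 0.5
    print(bool(d.sum(mtol=0.4).array.mask))  # True: 0.5 > 0.4, result masked
    print(bool(d.sum(mtol=0.5).array.mask))  # False: 0.5 is not > 0.5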
+
+    def test_Data_collapse_dtype(self):
+        d = cf.Data([1, 2, 3, 4], dtype="i4", chunks=2)
+        e = cf.Data([1.0, 2, 3, 4], dtype="f4", chunks=2)
+        self.assertEqual(d.dtype, "i4")
+        self.assertEqual(e.dtype, "f4")
+
+        # Cases for which both d and e collapse to a result of the
+        # same data type
+        for x, r in zip((d, e), ("i4", "f4")):
+            for func in (
+                x.min,
+                x.minimum_absolute_value,
+                x.max,
+                x.maximum_absolute_value,
+                x.range,
+            ):
+                self.assertEqual(func().dtype, r)
+
+        # Cases for which both d and e collapse to a result of the
+        # 64-bit type of the same kind
+        for x, r in zip((d, e), ("i8", "f8")):
+            for func in (
+                x.integral,
+                x.sum,
+                x.sum_of_squares,
+            ):
+                self.assertEqual(func().dtype, r)
+
+        # Cases for which both d and e collapse to a double-precision
+        # float result
+        for x, r in zip((d, e), ("f8", "f8")):
+            for func in (
+                x.mean,
+                x.mean_absolute_value,
+                x.median,
+                x.mid_range,
+                x.root_mean_square,
+                x.std,
+                x.var,
+            ):
+                self.assertEqual(func().dtype, r)
+
+        x = d
+        for func in (
+            x.sum_of_weights,
+            x.sum_of_weights2,
+        ):
+            self.assertEqual(func().dtype, "i8")
+
+        # Weights
+        w_int = cf.Data(1, dtype="i4")
+        w_float = cf.Data(1.0, dtype="f4")
+        for w, r in zip((w_int, w_float), ("i8", "f8")):
+            for func in (
+                d.integral,
+                d.sum,
+                d.sum_of_squares,
+                d.sum_of_weights,
+                d.sum_of_weights2,
+            ):
+                self.assertEqual(func(weights=w).dtype, r)
+
+        # TODODASK - add in mean_of_upper_decile
+
     def test_Data_get_units(self):
         for units in ("", "m", "days since 2000-01-01"):
             d = cf.Data(1, units)
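The promotion rules asserted in `test_Data_collapse_dtype` can be summarised
with a quick interactive check (a sketch, assuming cf is installed)::

    import cf

    d = cf.Data([1, 2, 3, 4], dtype="i4", chunks=2)

    print(d.min().dtype)   # int32: min/max/range keep the input dtype
    print(d.sum().dtype)   # int64: sums widen to 64 bits of the same kind
    print(d.mean().dtype)  # float64: means are always double precision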
diff --git a/docs/Makefile b/docs/Makefile
index e101c0f118..6850180e1d 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -2,7 +2,7 @@
 #
 # You can set these variables from the command line.
-SPHINXOPTS = -j 2
+SPHINXOPTS = -j 1
 SPHINXBUILD = sphinx-build
 PAPER =
 #BUILDDIR = build
diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst
index 3ea43360f5..fe2b323451 100644
--- a/docs/source/field_analysis.rst
+++ b/docs/source/field_analysis.rst
@@ -125,90 +125,123 @@ Collapse methods
 ^^^^^^^^^^^^^^^^
 
 The following collapse methods are available, over any subset of the
-domain axes. The "Cell method" column in the table gives the method of
-the new cell method construct (if one is created).
+domain axes. The "Cell method" column in the table gives the method
+name, defined by the CF conventions, of the new cell method construct
+(if one is created).
 
 ============================ ======================================== ==========================
 Method                       Description                              Cell method
 ============================ ======================================== ==========================
-``'maximum'``                The maximum of the values.               ``maximum``
+``'sample_size'``            The sample size, :math:`N`, equal to the ``point``
+                             number of non-missing values.
+
+``'maximum'``                The maximum of the non-missing values.   ``maximum``
+
+                             .. math:: m_x=\max\{x_1, \ldots, x_N\}
 
-``'minimum'``                The minimum of the values.               ``minimum``
+``'minimum'``                The minimum of the non-missing values.   ``minimum``
+
+                             .. math:: m_n=\min\{x_1, \ldots, x_N\}
 
-``'maximum_absolute_value'`` The maximum of the absolute values.      ``maximum_absolute_value``
+``'maximum_absolute_value'`` The maximum of the non-missing absolute  ``maximum_absolute_value``
+                             values.
+
+                             .. math:: \max\{|x_1|, \ldots, |x_N|\}
 
-``'minimum_absolute_value'`` The minimum of the absolute values.      ``minimum_absolute_value``
+``'minimum_absolute_value'`` The minimum of the non-missing absolute  ``minimum_absolute_value``
+                             values.
+
+                             .. math:: \min\{|x_1|, \ldots, |x_N|\}
 
 ``'mid_range'``              The average of the maximum and the       ``mid_range``
-                             minimum of the values.
+                             minimum of the non-missing values.
+
+                             .. math:: \frac{m_x + m_n}{2}
 
 ``'range'``                  The absolute difference between the      ``range``
-                             maximum and the minimum of the values.
+                             maximum and the minimum of the
+                             non-missing values.
+
+                             .. math:: m_x - m_n
 
-``'median'``                 The median of the values.                ``median``
+``'median'``                 The median of the :math:`N` non-missing  ``median``
+                             values.
 
-``'sample_size'``            The sample size, :math:`N`, as would be  ``point``
-                             used for other calculations, i.e. the
-                             number of non-missing values.
-
-``'sum_of_weights'``         The sum of :math:`N` weights             ``sum``
-                             :math:`w_i`, as would be used for other
-                             calculations, is
+``'sum_of_weights'``         The sum of the :math:`N` weights         ``sum``
+                             :math:`w_i` that correspond to
+                             non-missing values.
 
                              .. math:: V_{1}=\sum_{i=1}^{N} w_i
 
-``'sum_of_weights2'``        The sum of the squares of :math:`N`      ``sum``
-                             weights :math:`w_i`, as would be used
-                             for other calculations, is
+``'sum_of_weights2'``        The sum of the squares of the :math:`N`  ``sum``
+                             weights :math:`w_i` that correspond to
+                             non-missing values.
 
-                            .. math:: V_{2}=\sum_{i=1}^{N} w_i^{2}
+                             .. math:: V_{2}=\sum_{i=1}^{N} w_i^{2}
 
-``'sum'``                    The unweighted sum of :math:`N` values   ``sum``
-                             :math:`x_i` is
+``'sum'``                    The unweighted sum of the :math:`N`      ``sum``
+                             non-missing values :math:`x_i` is
 
                              .. math:: t=\sum_{i=1}^{N} x_i
 
-``'sum_of_squares'``         The unweighted sum of the squares of     ``sum_of_squares``
-                             :math:`N` values :math:`x_i` is
+                             The weighted sum of the :math:`N`
+                             non-missing values :math:`x_i` with
+                             corresponding weights :math:`w_i` is
+
+                             .. math:: \hat{t}=\sum_{i=1}^{N} w_i x_i
+
+``'sum_of_squares'``         The unweighted sum of the squares of the ``sum_of_squares``
+                             :math:`N` non-missing values :math:`x_i`
+                             is
 
                              .. math:: t_2=\sum_{i=1}^{N} x_{i}^{2}
 
-``'integral'``               The integral of :math:`N` values         ``sum``
-                             :math:`x_i` with corresponding cell
-                             measures :math:`m_i` is
+                             The weighted sum of the squares of the
+                             :math:`N` non-missing values :math:`x_i`
+                             with corresponding weights :math:`w_i`
+                             is
+
+                             .. math:: \hat{t}_2=\sum_{i=1}^{N}
+                                       w_i x_{i}^{2}
+
+``'integral'``               The :ref:`weighted <Weights>`            ``sum``
+                             sum of the :math:`N` non-missing values
+                             :math:`x_i` with corresponding cell
+                             measures :math:`m_i`.
 
                              .. math:: i=\sum_{i=1}^{N} m_i x_i
 
-                             Note that the integral differs from a
-                             weighted sum in that the units of the
-                             cell measures are incorporated into the
-                             result.
+                             .. note:: The integral differs from a
+                                       weighted sum in that the units
+                                       of the measures are
+                                       incorporated into the result.
 
-``'mean'``                   The unweighted mean of :math:`N` values  ``mean``
-                             :math:`x_i` is
+``'mean'``                   The unweighted mean of the :math:`N`     ``mean``
+                             non-missing values :math:`x_i` is
 
                              .. math:: \mu=\frac{1}{N}\sum_{i=1}^{N}
                                        x_i
 
                              The :ref:`weighted <Weights>`
-                             mean of :math:`N` values :math:`x_i`
-                             with corresponding weights :math:`w_i`
-                             is
+                             mean of the :math:`N` non-missing values
+                             :math:`x_i` with corresponding weights
+                             :math:`w_i` is
 
                              .. math:: \hat{\mu}=\frac{1}{V_{1}}
                                        \sum_{i=1}^{N}
-                                       w_i x_i
+                                        w_i x_i
 
-``'mean_absolute_value'``    The unweighted mean of :math:`N`         ``mean_absolute_value``
-                             values :math:`x_i` absoluted is
+``'mean_absolute_value'``    The unweighted mean of the :math:`N`     ``mean_absolute_value``
+                             non-missing absolute values :math:`x_i`
+                             is
 
                              .. math:: \mu_{abs}=\frac{1}{N}
                                        \sum_{i=1}^{N}|x_i|
 
                              The :ref:`weighted <Weights>`
-                             mean of :math:`N` values :math:`x_i`
-                             absoluted with corresponding weights
-                             :math:`w_i` is
+                             mean of the :math:`N` non-missing
+                             absolute values :math:`x_i` with
+                             corresponding weights :math:`w_i` is
 
                              .. math:: \hat{\mu}_{abs}=
                                        \frac{1}{V_{1}}
@@ -218,8 +251,8 @@ Method                       Description                              Cell met
                              upper group of data values defined by
                              the upper tenth of their distribution
 
-``'variance'``               The unweighted variance of :math:`N`     ``variance``
-                             values :math:`x_i` and with
+``'variance'``               The unweighted variance of the :math:`N` ``variance``
+                             non-missing values :math:`x_i` and with
                              :math:`N-ddof` degrees of freedom
                              (:math:`ddof\ge0`) is
 
@@ -236,9 +269,10 @@ Method                       Description                              Cell met
                              :math:`ddof=1`.
 
                              The :ref:`weighted <Weights>`
-                             biased estimate of the variance of
-                             :math:`N` values :math:`x_i` with
-                             corresponding weights :math:`w_i` is
+                             biased estimate of the variance of the
+                             :math:`N` non-missing values :math:`x_i`
+                             with corresponding weights :math:`w_i`
+                             is
 
                              .. math:: \hat{s}_{N}^{2}=
                                        \frac{1}{V_{1}}
@@ -246,9 +280,11 @@ Method                       Description                              Cell met
                                        w_i(x_i - \hat{\mu})^{2}
 
-                             The corresponding :ref:`weighted
-                             <Weights>` unbiased estimate of
-                             the variance is
+                             The :ref:`weighted <Weights>`
+                             unbiased estimate of the variance of the
+                             :math:`N` non-missing values :math:`x_i`
+                             with corresponding weights :math:`w_i`
+                             is
 
                              .. math:: \hat{s}^{2}=\frac{1}{V_{1} -
                                        (V_{2}/V_{1})}
@@ -256,28 +292,31 @@ Method                       Description                              Cell met
                                        w_i(x_i - \hat{\mu})^{2}
 
-                             In both cases, the weights are assumed
-                             to be non-random reliability weights, as
-                             opposed to frequency weights.
+                             .. note:: The weights used in the
+                                       variance calculations are
+                                       assumed to be non-random
+                                       reliability weights, as
+                                       opposed to frequency weights.
 
 ``'standard_deviation'``     The standard deviation is the square     ``standard_deviation``
                              root of the unweighted or
                              :ref:`weighted <Weights>`
-                             variance, as defined in this table.
+                             variance.
 
-``'root_mean_square'``       The unweighted root mean square of       ``root_mean_square``
-                             :math:`N` values :math:`x_i` is
+``'root_mean_square'``       The unweighted root mean square of the   ``root_mean_square``
+                             :math:`N` non-missing values
+                             :math:`x_i` is
 
-                             .. math:: RMS=\sqrt{\frac{1}{N}
+                             .. math:: r=\sqrt{\frac{1}{N}
                                        \sum_{i=1}^{N} x_{i}^2}
 
                              The :ref:`weighted <Weights>`
-                             root mean square of :math:`N` values
-                             :math:`x_i` with corresponding weights
-                             :math:`w_i` is
+                             root mean square of the :math:`N`
+                             non-missing values :math:`x_i` with
+                             corresponding weights :math:`w_i` is
 
-                             .. math:: \hat{RMS}=\sqrt{
+                             .. math:: \hat{r}=\sqrt{
                                        \frac{1}{V_{1}}
                                        \sum_{i=1}^{N} w_i x_{i}^2}
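The weighted-mean formula in the table above can be verified numerically.
A minimal sketch with made-up values, assuming cf is installed::

    import numpy as np
    import cf

    x = np.array([1.0, 2.0, 3.0, 4.0])
    w = np.array([4.0, 3.0, 2.0, 1.0])

    mu_hat = (w * x).sum() / w.sum()  # (1/V1) * sum(w_i * x_i) = 2.0
    print(cf.Data(x, "K").mean(weights=w).array)  # [2.]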
diff --git a/requirements.txt b/requirements.txt
index 4b7bb69d27..49aa387a59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ numpy>=1.22
 cfdm>=1.9.1.0, <1.9.2.0
 psutil>=0.6.0
 cfunits>=3.3.4
+dask>=2022.03.0
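To confirm that a local environment satisfies the new dask floor pinned
above, something like the following works (a sketch; `packaging` is an
assumed helper here, not a declared dependency of cf)::

    import dask
    from packaging.version import Version  # assumption: packaging is installed

    # Raises AssertionError, reporting the installed version, if too old
    assert Version(dask.__version__) >= Version("2022.03.0"), dask.__version__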