From 20562adca53bde34ed91cd4faf14b1f77fb039a3 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Fri, 11 Mar 2022 18:30:30 +0000
Subject: [PATCH 01/37] dev

---
 cf/data/collapse_functions.py | 972 +++++++++++++++++++---------------
 cf/data/dask_utils.py         |   2 +
 cf/data/data.py               | 289 ++++++----
 3 files changed, 713 insertions(+), 550 deletions(-)

diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py
index 79017e4ace..c092d58d8a 100644
--- a/cf/data/collapse_functions.py
+++ b/cf/data/collapse_functions.py
@@ -1,5 +1,10 @@
+from functools import partial
 from functools import partial as functools_partial
+from functools import reduce
+from operator import mul
 
+import numpy as np
+from dask.array.reductions import reduction
 from numpy import abs as numpy_abs
 from numpy import amax as numpy_amax
 from numpy import amin as numpy_amin
@@ -33,19 +38,18 @@ def asanyarray(*args):
 
     :Returns:
 
-        `tuple`
+        `list`
            The input objects left as, else converted to, `numpy.ndarray`
 
    """
    out = []
    for x in args:
-        if x is not None and not numpy_ndim(x):
+        if x is not None and not np.ndim(x):
            # Make sure that we have a numpy array (as opposed to, e.g. a
            # numpy.float64)
-            out.append(numpy_asanyarray(x))
+            out.append(np.asanyarray(x))
        else:
            out.append(x)
-    # --- End: for
 
    return out
 
@@ -216,6 +220,42 @@
    return asanyarray(N, x)
 
 
+def mask_small_sample_size(x, N, axis, mtol, original_shape):
+    """Mask elements of x where the sample size N is too small.
+
+    :Parameters:
+
+        x: `numpy.ndarray`
+            The collapsed data.
+
+        N: `numpy.ndarray`
+            The sample sizes of the collapsed values.
+
+        axis: sequence of `int`
+            The axes that were collapsed.
+
+        mtol: number
+            The sample size threshold below which collapsed values are
+            masked, expressed as a fraction between 0 and 1 of the
+            maximum possible sample size.
+
+        original_shape: `tuple`
+            The shape of the original, uncollapsed data.
+
+    :Returns:
+
+        `numpy.ndarray`
+            *x* masked where *N* is strictly less than
+            ``(1 - mtol) * Nmax``.
+
+    """
+    if mtol < 1:
+        Nmax = reduce(mul, [original_shape[i] for i in axis], 1)
+        x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False)
+
+    return x
+
+
 def double_precision(a):
    """Convert the input array to double precision.
 
@@ -242,219 +282,6 @@
    return a.astype(newtype, copy=False)
 
 
-# --------------------------------------------------------------------
-# Maximum
-# --------------------------------------------------------------------
-def max_f(a, axis=None, masked=False):
-    """Return the maximum of an array or maximum along a specified axis.
-
-    :Parameters:
-
-        a: numpy array_like
-            Input array
-
-        axis: `int`, optional
-            Axis along which to operate. By default, flattened input
-            is used.
-
-        masked: `bool`
-
-    :Returns:
-
-        2-`tuple` of `numpy.ndarray`
-            The sample size and the maximum.
-
-    """
-    (N,) = sample_size_f(a, axis=axis, masked=masked)
-    amax = numpy_amax(a, axis=axis)
-
-    if not numpy_ndim(amax):
-        # Make sure that we have a numpy array (as opposed to, e.g. a
-        # numpy.float64)
-        amax = numpy_asanyarray(amax)
-
-    return asanyarray(N, amax)
-
-
-def max_fpartial(out, out1=None, group=False):
-    """Return the partial maximum of an array.
-
-    :Parameters:
-
-        out: 2-`tuple` of `numpy.ndarray`
-
-        out1: 2-`tuple` of `numpy.ndarray`, optional
-
-    :Returns:
-
-        2-`tuple` of `numpy.ndarray`
-            The sample size and the maximum.
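# --------------------------------------------------------------------
# Illustration (not part of the patch) of the mtol behaviour that the
# new mask_small_sample_size introduces. With mtol=0.5 a collapsed
# value is masked when fewer than half of the elements that could have
# contributed to it were unmasked. Shapes here are hypothetical.
# --------------------------------------------------------------------
import numpy as np
from functools import reduce
from operator import mul

original_shape = (4, 3)
axis = (0,)
N = np.array([4, 2, 1])        # unmasked counts after collapsing axis 0
x = np.array([1.0, 2.0, 3.0])  # collapsed values
mtol = 0.5

Nmax = reduce(mul, [original_shape[i] for i in axis], 1)  # 4
masked = np.ma.masked_where(N < (1 - mtol) * Nmax, x)     # masks only the N=1 element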
- - """ - N, amax = out - - if out1 is not None: - N1, amax1 = out1 - N = psum(N, N1) - amax = pmax(amax, amax1) - - return asanyarray(N, amax) - - -def max_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the maximum of an - array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - return mask_where_too_few_values(1, *out) - - -# -------------------------------------------------------------------- -# Minimum -# -------------------------------------------------------------------- -def min_f(a, axis=None, masked=False): - """Return the minimum of an array or minimum along a specified axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amin = numpy_amin(a, axis=axis) - - return asanyarray(N, amin) - - -def min_fpartial(out, out1=None, group=False): - """Return the partial minimum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - N, amin = out - - if out1 is not None: - N1, amin1 = out1 - N = psum(N, N1) - amin = pmin(amin, amin1) - - return asanyarray(N, amin) - - -def min_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the minimum of an - array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - return mask_where_too_few_values(1, *out) - - -# -------------------------------------------------------------------- -# maximum_absolute_value -# -------------------------------------------------------------------- -def max_abs_f(a, axis=None, masked=False): - """Return the maximum of the absolute array, or the maximum of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: bool - - :Returns: - - 2-tuple of numpy arrays - The sample sizes and the maxima of the absolute values. - - """ - return max_f(numpy_abs(a), axis=axis, masked=masked) - - -max_abs_fpartial = max_fpartial -max_abs_ffinalise = max_ffinalise - - -# -------------------------------------------------------------------- -# minimum_absolute_value -# -------------------------------------------------------------------- -def min_abs_f(a, axis=None, masked=False): - """Return the minimum of the absolute array, or the minimum of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input is - used. - - masked: `bool` - - :Returns: - - 2-tuple of `numpy.ndarray` - The sample sizes and the minima of the absolute values. 
- - """ - return min_f(numpy_abs(a), axis=axis, masked=masked) - - -min_abs_fpartial = min_fpartial -min_abs_ffinalise = min_ffinalise # -------------------------------------------------------------------- @@ -664,223 +481,7 @@ def root_mean_square_ffinalise(out, sub_samples=None): return asanyarray(N, avg) -# -------------------------------------------------------------------- -# Mid-range: Average of maximum and minimum -# -------------------------------------------------------------------- -def mid_range_f(a, axis=None, masked=False): - """Return the minimum and maximum of an array or the minimum and - maximum along an axis. - - ``mid_range_f(a, axis=axis)`` is equivalent to ``(numpy.amin(a, - axis=axis), numpy.amax(a, axis=axis))`` - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - kwargs: ignored - - :Returns: - - 3-`tuple` - The sample size, minimum and maximum inside a 3-tuple. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amin = numpy_amin(a, axis=axis) - amax = numpy_amax(a, axis=axis) - - if not numpy_ndim(amin): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - amin = numpy_asanyarray(amin) - amax = numpy_asanyarray(amax) - - return asanyarray(N, amin, amax) - - -def mid_range_fpartial(out, out1=None, group=False): - """Return the partial minimum and partial maximum of an array. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - - out1: 3-`tuple` of `numpy.ndarray`, optional - - group: ignored - - :Returns: - - 3-`tuple` - The sample size, minimum and maximum inside a 3-tuple. - - """ - N, amin, amax = out - - if out1 is not None: - N1, amin1, amax1 = out1 - - N = psum(N, N1) - amin = pmin(amin, amin1) - amax = pmax(amax, amax1) - - return asanyarray(N, amin, amax) - - -def mid_range_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the array mid-range - value. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: ordered sequence - - amin: `numpy.ndarray` - - amax: `numpy.ndarray` - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the mid-range value. - - """ - N, amin, amax = out - - amax = double_precision(amax) - - # Cast bool, unsigned int, and int to float64 - if issubclass(amax.dtype.type, (numpy_integer, numpy_bool_)): - amax = amax.astype(float) - - # Calculate the mid-range value, storing it in the amax variable - amax += amin - amax *= 0.5 - - return mask_where_too_few_values(1, N, amax) - - -# --------------------------------------------------------------------- -# Range: Absolute difference between maximum and minimum -# --------------------------------------------------------------------- -range_f = mid_range_f -range_fpartial = mid_range_fpartial - - -def range_ffinalise(out, sub_samples=None): - """Absolute difference between maximum and minimum. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - out: ordered sequence - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the range. 
- - """ - N, amin, amax = out - - # Calculate the range value, storing it in the amax variable - amax = double_precision(amax) - amax -= amin - - return mask_where_too_few_values(1, N, amax) - - -# --------------------------------------------------------------------- -# Sample size -# --------------------------------------------------------------------- -def sample_size_f(a, axis=None, masked=False): - """Return the sample size. - - :Parameters: - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - `numpy.ndarray` - - """ - if masked: - N = numpy_sum(~a.mask, axis=axis, dtype=float) - if not numpy_ndim(N): - N = numpy_asanyarray(N) - else: - if axis is None: - N = numpy_array(a.size, dtype=float) - else: - shape = a.shape - N = numpy_empty(shape[:axis] + shape[axis + 1 :], dtype=float) - N[...] = shape[axis] - # --- End: if - - return asanyarray(N) - - -def sample_size_fpartial(out, out1=None, group=False): - """Return the partial sample size. - - :Parameters: - - out: ordered sequence of one numpy array - - :Returns: - - `numpy.ndarray` - - """ - (N,) = out - if out1 is not None: - (N1,) = out1 - N = psum(N, N1) - - return asanyarray(N) - - -def sample_size_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to give the sample size. - - :Parameters: - - out: ordered sequence of one numpy array - - sub_samples: optional - Ignored. - - :Returns: - - `tuple` - A 2-tuple containing *N* twice. - - """ - (N,) = out - return asanyarray(N, N) - - -# --------------------------------------------------------------------- -# sum -# --------------------------------------------------------------------- def sum_f(a, axis=None, weights=None, masked=False): """Return the sum of an array or the sum along an axis. 
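# --------------------------------------------------------------------
# Illustration (not part of the patch) of the counting that both the
# old sample_size_f and the new cf_sample_size_chunk rely on: for a
# masked array the sample size is the sum of the inverted mask.
# --------------------------------------------------------------------
import numpy as np

a = np.ma.masked_invalid([[1.0, np.nan, 3.0], [4.0, 5.0, np.nan]])
N = np.sum(~np.ma.getmaskarray(a), axis=0)  # -> array([2, 1, 1])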
@@ -1377,3 +978,508 @@ def sd_ffinalise(out, sub_samples=None): sd **= 0.5 return asanyarray(N, sd) + + +from dask.array import chunk +from dask.array.core import _concatenate2, broadcast_to +from dask.array.reductions import divide +from dask.utils import deepmap # Apply function inside nested lists + + +def combine_sample_sizes(pairs, axis, **kwargs): + # Create a nested list of N and recursively concatenate it + # along the specified axes + return combine_arrays(pairs, "N", chunk.sum, axis, int, False, **kwargs) + + +def combine_arrays( + pairs, key, func, axis, dtype, computing_meta=False, **kwargs +): + # Create a nested list of N and recursively concatenate it + # along the specified axes + x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs + if dtype: + kwargs["dtype"] = dtype + + x = func(_concatenate2(x, axes=axis), axis=axis, **kwargs) + return x + + +# -------------------------------------------------------------------- +# max +# -------------------------------------------------------------------- +def cf_max_chunk(x, computing_meta=False, **kwargs): + """Find the max of an array.""" + if computing_meta: + return x + + d = {"max": chunk.max(x, **kwargs)} + d.update(cf_sample_size_chunk(x, **kwargs)) + return d + + +def cf_max_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + """Find the max and min of a nested list of arrays.""" + if not isinstance(pairs, list): + pairs = [pairs] + + # Create a nested list of maxima and recursively concatenate it + # along the specified axes + m = combine_arrays( + pairs, "max", chunk.max, axis, None, computing_meta, **kwargs + ) + if computing_meta: + return m + + return {"N": combine_sample_sizes(pairs, axis, **kwargs), "max": m} + + +def cf_max_agg( + pairs, + axis=None, + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Find the range of a nested list of arrays.""" + d = cf_max_combine(pairs, axis, computing_meta, **kwargs) + + m = d["max"] + m = mask_small_sample_size(m, d["N"], axis, mtol, original_shape) + + return m + + +def cf_max(a, axis=None, keepdims=False, mtol=1, split_every=None): + """TODODASK.""" + dtype = a.dtype + return reduction( + a, + cf_max_chunk, + partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_max_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + + +# -------------------------------------------------------------------- +# maximum_absolute_value +# -------------------------------------------------------------------- +def cf_max_abs_chunk(x, computing_meta=False, **kwargs): + """Return the maximum of the absolute array, or the maximum of the + absolute array along an axis. + + :Parameters: + + a: numpy array_like + Input array + + axis: `int`, optional + Axis along which to operate. By default, flattened input + is used. + + masked: bool + + :Returns: + + 2-tuple of numpy arrays + The sample sizes and the maxima of the absolute values. 
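# --------------------------------------------------------------------
# Usage sketch (not part of the patch), assuming the functions above
# are importable: cf_max mirrors dask.array.max but threads the sample
# size N through the chunk/combine/aggregate stages so that mtol can
# mask results built from too few unmasked values.
# --------------------------------------------------------------------
import dask.array as da
import numpy as np

x = np.ma.masked_greater(np.arange(12).reshape(3, 4), 9)
dx = da.from_array(x, chunks=(2, 2))
result = cf_max(dx, axis=0, keepdims=True, mtol=0.5)  # lazy reduction
print(result.compute())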
+ + """ + if computing_meta: + return x + + d = {"max": cf_max_chunk(np.abs(x), **kwargs)} + d.update(cf_sample_size_chunk(x, **kwargs)) + return d + + +cf_max_abs_combine = cf_max_combine +cf_max_abs_agg = cf_max_agg + +# -------------------------------------------------------------------- +# min +# -------------------------------------------------------------------- +def cf_min_chunk(x, computing_meta=False, **kwargs): + """Find the max of an array.""" + if computing_meta: + return x + + d = {"min": chunk.min(x, **kwargs)} + d.update(cf_sample_size_chunk(x, **kwargs)) + return d + + +def cf_min_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + """Find the max and min of a nested list of arrays.""" + if not isinstance(pairs, list): + pairs = [pairs] + + # Create a nested list of maxima and recursively concatenate it + # along the specified axes + m = combine_arrays( + pairs, "min", chunk.min, axis, None, computing_meta, **kwargs + ) + if computing_meta: + return m + + return {"N": combine_sample_sizes(pairs, axis, **kwargs), "min": m} + + +def cf_min_agg( + pairs, + axis=None, + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Find the range of a nested list of arrays.""" + d = cf_min_combine(pairs, axis, computing_meta, **kwargs) + + m = d["min"] + m = mask_small_sample_size(m, d["N"], axis, mtol, original_shape) + + return m + + + +def cf_min(a, axis=None, keepdims=False, mtol=1, split_every=None): + """TODODASK.""" + dtype = a.dtype + return reduction( + a, + cf_min_chunk, + partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_min_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + +# -------------------------------------------------------------------- +# minimum absolute value +# -------------------------------------------------------------------- +def cf_min_abs_chunk(x, computing_meta=False, **kwargs): + """Return the maximum of the absolute array, or the maximum of the + absolute array along an axis. + + :Parameters: + + a: numpy array_like + Input array + + axis: `int`, optional + Axis along which to operate. By default, flattened input + is used. + + masked: bool + + :Returns: + + 2-tuple of numpy arrays + The sample sizes and the maxima of the absolute values. 
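# --------------------------------------------------------------------
# For orientation, an illustrative sketch (hypothetical values, not
# code from the patch) of the dict-of-partials protocol shared by the
# cf_* functions: each chunk returns its partial statistics, and the
# combine step re-reduces them along the collapsed axes.
# --------------------------------------------------------------------
import numpy as np

chunk_a = {"min": np.array([[1]]), "N": np.array([[4]])}
chunk_b = {"min": np.array([[-2]]), "N": np.array([[3]])}
combined = {
    "min": np.minimum(chunk_a["min"], chunk_b["min"]),  # [[-2]]
    "N": chunk_a["N"] + chunk_b["N"],                   # [[7]]
}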
+ + """ + if computing_meta: + return x + + d = {"min": cf_min_chunk(np.abs(x), **kwargs)} + d.update(cf_sample_size_chunk(x, **kwargs)) + return d + + +cf_min_abs_combine = cf_min_combine +cf_min_abs_agg = cf_min_agg + +# -------------------------------------------------------------------- +# range +# -------------------------------------------------------------------- +def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Find the max and min of an array.""" + if computing_meta: + return x + + d = cf_max_chunk(x, **kwargs) + d.update(cf_min_chunk(x, **kwargs)) + d.update(cf_sample_size_chunk(x, dtype=dtype, **kwargs)) + return d + + +def cf_range_combine( + pairs, + axis=None, + dtype=None, + computing_meta=False, + **kwargs, +): + """Find the max and min of a nested list of arrays.""" + if not isinstance(pairs, list): + pairs = [pairs] + + # Create a nested list of maxima and recursively concatenate it + # along the specified axes + mx = combine_arrays( + pairs, "max", chunk.max, axis, None, computing_meta, **kwargs + ) + if computing_meta: + return mx + + mn = combine_arrays( + pairs, "min", chunk.min, axis, None, computing_meta, **kwargs + ) + + return { + "N": combine_sample_sizes(pairs, axis, **kwargs), + "max": mx, + "min": mn, + } + + +def cf_range_agg( + pairs, + axis=None, + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Find the range of a nested list of arrays.""" + d = cf_range_combine(pairs, axis, computing_meta, **kwargs) + + # Calculate the range + r = d["max"] - d["min"] + r = mask_small_sample_size(r, d["N"], axis, mtol, original_shape) + + return r + + +def cf_range(a, axis=None, keepdims=False, mtol=1, split_every=None): + """TODODASK.""" + dtype = a.dtype + return reduction( + a, + cf_range_chunk, + partial(cf_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + +# -------------------------------------------------------------------- +# mid-range +# -------------------------------------------------------------------- + +cf_mid_range_chunk = cf_range_chunk +cf_mid_range_combine = cf_range_combine + + +def cf_mid_range_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Find the mid-range of a nested list of arrays.""" + d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + + # Calculate the mid-range + mr = d["max"] + d["min"] + mr = divide(mr, 2.0, dtype=dtype) + mr = mask_small_sample_size(mr, d["N"], axis, mtol, original_shape) + + return mr + + +def cf_mid_range( + a, axis=None, dtype=float, keepdims=False, mtol=1, split_every=None +): + """TODODASK.""" + dtype = float + return reduction( + a, + cf_mid_range_chunk, + partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mid_range_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + +# -------------------------------------------------------------------- +# sample_size +# -------------------------------------------------------------------- +def cf_sample_size_chunk(x, dtype=int, computing_meta=False, **kwargs): + if computing_meta: + return x + + if np.ma.isMA(x): + N = chunk.sum(~x.mask, dtype=dtype, **kwargs) + if not np.ndim(N): + N = np.asanyarray(N) + else: + axis = kwargs["axis"] + shape = [1 
if i in axis else n for i, n in enumerate(x.shape)] + size = reduce(mul, [n for i, n in enumerate(x.shape) if i in axis], 1) + N = np.full(shape, size, dtype=dtype) + + return {"N": N} + + +def cf_sample_size_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + if not isinstance(pairs, list): + pairs = [pairs] + + N = combine_arrays(pairs, "N", chunk.sum, axis, None, + computing_meta, **kwargs) + if computing_meta: + return N + + return {"N": N} + + +def cf_sample_size_agg( + pairs, + axis=None, + computing_meta=False, + func=None, + mtol=1, + original_shape=None, + **kwargs, +): + d = cf_sample_size_combine(pairs, axis, computing_meta, **kwargs) + + N = d["N"] + N = mask_small_sample_size(N, N, axis, mtol, original_shape) + return N + + +def cf_sample_size(a, axis=None, keepdims=False, mtol=1, split_every=None): + """TODODASK.""" + dtype = int + return reduction( + a, + cf_sample_size_chunk, + partial(cf_sample_size_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sample_size_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + + +# -------------------------------------------------------------------- +# sum +# -------------------------------------------------------------------- +def cf_sum_chunk(x, dtype=float, computing_meta=False, func=None, **kwargs): + """Find the max of an array.""" + if computing_meta: + return x + + d = {"sum": chunk.sum(x, **kwargs)} + d.update(cf_sample_size_chunk(x, dtype=dtype, **kwargs)) + return d + + +def cf_sum_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + func=None, + **kwargs, +): + """Apply the function to the data in a nested list of arrays.""" + if not isinstance(pairs, list): + pairs = [pairs] + + # Create a nested list of maxima and recursively concatenate it + # along the specified axes + x = combine_arrays( + pairs, "sum", chunk.sum, axis, dtype, computing_meta, **kwargs + ) + if computing_meta: + return x + + return {"N": combine_sample_sizes(pairs, axis, **kwargs), "sum": x} + + +def cf_sum_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + func=None, + mtol=1, + original_shape=None, + **kwargs, +): + """Apply the function to the data in a nested list of arrays and + mask where the sample size is below the threshold.""" + d = cf_sum_combine(pairs, axis, computing_meta, **kwargs) + + x = mask_small_sample_size(d["sum"], d["N"], axis, mtol, original_shape) + return x + + +def cf_sum(a, axis=None, keepdims=False, mtol=1, split_every=None): + """TODODASK.""" + func = chunk.sum + dtype = float + return reduction( + a, + partial(cf_sum_chunk, func=func), + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 6429fd2afe..810d88cbc0 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -15,6 +15,8 @@ from ..functions import atol as cf_atol from ..functions import rtol as cf_rtol +# from dask.utils import deepmap # Apply function inside nested lists + def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): """An effective dask.array.ma.allclose method. 
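# --------------------------------------------------------------------
# Worked example (not part of the patch) of the unmasked branch of
# cf_sample_size_chunk, defined earlier in this patch: with no mask
# the count is constant over the kept axes, so it can be built
# directly with np.full.
# --------------------------------------------------------------------
import numpy as np
from functools import reduce
from operator import mul

x = np.arange(24).reshape(2, 3, 4)
axis = (0, 2)
shape = [1 if i in axis else n for i, n in enumerate(x.shape)]          # [1, 3, 1]
size = reduce(mul, [n for i, n in enumerate(x.shape) if i in axis], 1)  # 8
N = np.full(shape, size, dtype=int)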
diff --git a/cf/data/data.py b/cf/data/data.py index b6ec1c54a6..8bff606e7c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -17,6 +17,7 @@ from dask.base import is_dask_collection, tokenize from dask.core import flatten from dask.highlevelgraph import HighLevelGraph +from dask.array.routines import result_type from numpy.testing import suppress_warnings as numpy_testing_suppress_warnings from ..cfdatetime import dt as cf_dt @@ -49,40 +50,19 @@ NetCDFArray, UMArray, ) -from .collapse_functions import ( # max_f,; max_ffinalise,; max_fpartial, - max_abs_f, - max_abs_ffinalise, - max_abs_fpartial, +from .collapse_functions import ( mean_abs_f, mean_abs_ffinalise, mean_abs_fpartial, mean_f, mean_ffinalise, mean_fpartial, - mid_range_f, - mid_range_ffinalise, - mid_range_fpartial, - min_abs_f, - min_abs_ffinalise, - min_abs_fpartial, - min_f, - min_ffinalise, - min_fpartial, - range_f, - range_ffinalise, - range_fpartial, root_mean_square_f, root_mean_square_ffinalise, root_mean_square_fpartial, - sample_size_f, - sample_size_ffinalise, - sample_size_fpartial, sd_f, sd_ffinalise, sd_fpartial, - sum_f, - sum_ffinalise, - sum_fpartial, sum_of_squares_f, sum_of_squares_ffinalise, sum_of_squares_fpartial, @@ -8246,36 +8226,30 @@ def integral( _preserve_partitions=_preserve_partitions, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def sample_size( self, axes=None, squeeze=False, mtol=1, inplace=False, + split_every=None, i=False, - _preserve_partitions=False, ): - """Collapses axes with their sample size. - - :Parameters: - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} + from .collapse_functions import cf_sample_size - """ - return self._collapse( - sample_size_f, - sample_size_fpartial, - sample_size_ffinalise, - axes=axes, - squeeze=squeeze, - weights=None, + d = _inplace_enabled_define_and_cleanup(self) + d = _collapse( + d, + cf_sample_size, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=Units("1"), - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @property def binary_mask(self): @@ -10779,29 +10753,29 @@ def masked_all(cls, shape, dtype=None, units=None, chunk=True): return cls(array, units=units, chunk=chunk) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def mid_range( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, i=False, ): - """Collapse axes with the unweighted average of their maximum - and minimum values. + """Collapse axes with the absolute difference between their + maximum and minimum values. Missing data array elements are omitted from the calculation. - .. seealso:: `maximum`, `minimum`, `mean`, `range`, `sum`, `sd`, `var` + .. 
seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sample_size`, + `sd`, `sum`, `sum_of_weights`, `sum_of_weights2`, + `var` :Parameters: - axes: (sequence of) `int`, optional - - squeeze: `bool`, optional - {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -10814,16 +10788,18 @@ def mid_range( **Examples:** """ - return self._collapse( - mid_range_f, - mid_range_fpartial, - mid_range_ffinalise, - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_mid_range + + d = _inplace_enabled_define_and_cleanup(self) + d = _collapse( + d, + cf_mid_range, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @@ -12682,14 +12658,16 @@ def func( return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def range( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, i=False, ): """Collapse axes with the absolute difference between their @@ -12703,6 +12681,28 @@ def range( :Parameters: + split_every: `int` or `dict`, optional + Determines the depth of the recursive aggregation. If + set to a number greater an oe equal to the number of + input chunks, the aggregation will be performed in two + steps, one ``chunk`` function per input chunk and a + single ``aggregate`` function at the end. If set to + less than that (and greater than 1), an intermediate + ``combine`` function will be used, so that any one + ``combine`` or ``aggregate`` function has no more than + ``split_every`` inputs. The depth of the aggregation + graph will be :math:`log_{split_every}(input chunks + along reduced axes)`. Setting to a low value can + reduce cache size and network transfers, at the cost + of more CPU and a larger dask graph. + + Different values can be assigned to different axes in + a dictionary. + + Omit to let dask heuristically decide a good + default. A default can also be set globally with the + ``split_every`` key in :mod:`dask.config`. + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -12715,17 +12715,18 @@ def range( **Examples:** """ - return self._collapse( - range_f, - range_fpartial, - range_ffinalise, - axes=axes, - squeeze=squeeze, - weights=None, + from .collapse_functions import cf_range + + d = _inplace_enabled_define_and_cleanup(self) + d = _collapse( + d, + cf_range, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -12778,54 +12779,33 @@ def roll(self, axis, shift, inplace=False, i=False): return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def sum( self, axes=None, squeeze=False, mtol=1, - weights=None, inplace=False, + split_every=None, i=False, _preserve_partitions=False, ): - """Collapse axes with their sum. - - Missing data array elements are omitted from the calculation. - - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, - `sample_size`, `sd`, `sum_of_weights`, - `sum_of_weights2`, `var` - - :Parameters: - - axes : (sequence of) int, optional - - squeeze : bool, optional - - {{inplace: `bool`, optional}} + from .collapse_functions import cf_sum - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - The collapsed array. 
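# --------------------------------------------------------------------
# Sketch (not part of the patch) of what split_every controls in the
# collapses documented above: with 8 chunks along the reduced axis and
# split_every=2, partials are merged pairwise through log2(8) = 3
# combine rounds before the final aggregate. Assumes cf_range from
# collapse_functions is importable.
# --------------------------------------------------------------------
import dask.array as da

dx = da.ones((16, 4), chunks=(2, 4))  # 8 chunks along axis 0
r = cf_range(dx, axis=0, keepdims=True, split_every=2)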
- - **Examples:** - - """ - return self._collapse( - sum_f, - sum_fpartial, - sum_ffinalise, - axes=axes, - squeeze=squeeze, - weights=weights, + d = _inplace_enabled_define_and_cleanup(self) + dx = d._get_dask() + dx = cf_sum( + dx, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d def sum_of_squares( self, @@ -13290,6 +13270,36 @@ def section( self, axes, data=True, stop=stop, chunks=chunks, min_step=min_step ) + + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def sumw( + self, + axes=None, + weights=None, + squeeze=False, + mtol=1, + inplace=False, + split_every=None, + i=False, + ): + from .collapse_functions import cf_sumw + + d = _inplace_enabled_define_and_cleanup(self) + + d = _collapse( + d, + cf_sumw, + axis=axes, + weights=weights, + keepdims=not squeeze, + split_every=split_every, + mtol=mtol, + ) + return d + # ---------------------------------------------------------------- # Alias # ---------------------------------------------------------------- @@ -13298,43 +13308,56 @@ def dtarray(self): """Alias for `datetime_array`""" return self.datetime_array + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def max( self, axes=None, squeeze=False, mtol=1, inplace=False, + split_every=None, i=False, - _preserve_partitions=False, ): - """Alias for `maximum`""" - return self.maximum( - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_max + + d = _inplace_enabled_define_and_cleanup(self) + d = _collapse( + d, + cf_max, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - i=i, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def min( self, axes=None, squeeze=False, mtol=1, inplace=False, + split_every=None, i=False, _preserve_partitions=False, ): - """Alias for `minimum`""" - return self.minimum( - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_min + + d = _inplace_enabled_define_and_cleanup(self) + d = _collapse( + d, + cf_min, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - i=i, - _preserve_partitions=_preserve_partitions, ) + return d def standard_deviation( self, @@ -13607,3 +13630,35 @@ def _where_broadcastable(data, x, name): ) return True + + +def _collapse( + d, collapse_func, axis=None, weights=None, keepdims=True, split_every=None, mtol=1 +): + """TODODASK.""" + dx = d._get_dask() + + if weights is not None: + weights = da.asanyarray(dask_compatible(weights)) + + if issubclass(a.dtype.type, (np.integer, np.bool_)): + result_dtype = result_type(a.dtype, weights.dtype, "f8") + else: + result_dtype = result_type(a.dtype, weights.dtype) + + # Sort out broadcasting of weights!!! 
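# --------------------------------------------------------------------
# Illustration (not the eventual fix) of the broadcasting issue
# flagged in the draft _collapse above: 1-d weights supplied for a
# single collapse axis must gain size-1 dimensions before they can be
# multiplied into the full array.
# --------------------------------------------------------------------
import numpy as np

x = np.ones((3, 4))
w = np.array([1.0, 2.0, 3.0])  # weights for axis 0 only
weighted = x * w.reshape(3, 1)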
+ + dx = multiply(dx, weights, dtype=result_dtype) + + need to dived by sum of weiughts, sometimes + + dx = collapse_func( + dx, + axis=axis, + keepdims=keepdims, + split_every=split_every, + mtol=mtol, + ) + d._set_dask(dx, reset_mask_hardness=True) + + return d From 4eca601257518f8df7384e40ecdc1091323127c2 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 11 Mar 2022 19:14:33 +0000 Subject: [PATCH 02/37] dev --- cf/data/collapse_functions.py | 80 +++++++++++++++++++++++++++++++++++ cf/data/data.py | 14 +----- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index c092d58d8a..e99cac063e 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -983,6 +983,7 @@ def sd_ffinalise(out, sub_samples=None): from dask.array import chunk from dask.array.core import _concatenate2, broadcast_to from dask.array.reductions import divide +from dask.array.ufunc import multiply from dask.utils import deepmap # Apply function inside nested lists @@ -1483,3 +1484,82 @@ def cf_sum(a, axis=None, keepdims=False, mtol=1, split_every=None): concatenate=False, meta=np.array((), dtype=dtype), ) + +# -------------------------------------------------------------------- +# sum +# -------------------------------------------------------------------- +def cf_sumw_chunk(x, weights=None, dtype=float, computing_meta=False, + func=None, **kwargs): + """Find the max of an array.""" + print('PPP@',x, weights) + if computing_meta: + return x + + if weights is not None: + x = multiply(x, weights) # sort out dtype=result_dtype) + + d = {"sum": chunk.sum(x, **kwargs)} + d.update(cf_sample_size_chunk(x, dtype=dtype, **kwargs)) + + return d + + +def cf_sumw_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + func=None, + **kwargs, +): + """Apply the function to the data in a nested list of arrays.""" + if not isinstance(pairs, list): + pairs = [pairs] + + # Create a nested list of maxima and recursively concatenate it + # along the specified axes + x = combine_arrays( + pairs, "sum", chunk.sum, axis, dtype, computing_meta, **kwargs + ) + if computing_meta: + return x + + return {"N": combine_sample_sizes(pairs, axis, **kwargs), "sum": x} + + +def cf_sumw_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + func=None, + mtol=1, + original_shape=None, + **kwargs, +): + """Apply the function to the data in a nested list of arrays and + mask where the sample size is below the threshold.""" + d = cf_sum_combine(pairs, axis, computing_meta, **kwargs) + + x = mask_small_sample_size(d["sum"], d["N"], axis, mtol, original_shape) + return x + + +def cf_sumw(a, axis=None, keepdims=False, mtol=1, split_every=None): + """TODODASK.""" + + func = chunk.sum + dtype = float + return reduction( + a, + partial(cf_sumw_chunk, func=func), + partial(cf_sum_agg, mtol=mtol, original_shape=a[0].shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) diff --git a/cf/data/data.py b/cf/data/data.py index 8bff606e7c..1aea089748 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13288,7 +13288,6 @@ def sumw( from .collapse_functions import cf_sumw d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( d, cf_sumw, @@ -13641,19 +13640,8 @@ def _collapse( if weights is not None: weights = da.asanyarray(dask_compatible(weights)) - if issubclass(a.dtype.type, (np.integer, np.bool_)): - 
result_dtype = result_type(a.dtype, weights.dtype, "f8") - else: - result_dtype = result_type(a.dtype, weights.dtype) - - # Sort out broadcasting of weights!!! - - dx = multiply(dx, weights, dtype=result_dtype) - - need to dived by sum of weiughts, sometimes - dx = collapse_func( - dx, + (dx, weights), axis=axis, keepdims=keepdims, split_every=split_every, From 560980530470f51e09a8a7f0278ac6520a93b4ea Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 12 Mar 2022 19:23:52 +0000 Subject: [PATCH 03/37] dev --- cf/data/collapse_functions.py | 580 ++++++++++++++++++++++++++-------- cf/data/data.py | 393 ++++++++++++----------- cf/data/utils.py | 2 +- 3 files changed, 662 insertions(+), 313 deletions(-) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index e99cac063e..3420170bfb 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -243,7 +243,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): Nmax = reduce(mul, [original_shape[i] for i in axis], 1) x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) - return x + return asanyarray(x)[0] def double_precision(a): @@ -986,6 +986,23 @@ def sd_ffinalise(out, sub_samples=None): from dask.array.ufunc import multiply from dask.utils import deepmap # Apply function inside nested lists +def sum_of_weights(x, weights=None, N=None, squared=False, **kwargs): + """TODO""" + if weights is None: + if N is None: + N = cf_sample_size_chunk(x, **kwargs)["N"] + + sw = N + else: + if squared: + weights = multiply(weights, weights, dtype=float) + + if np.ma.is_masked(x): + weights = np.ma.masked_where(x.mask, weights) + + sw = weights.sum(**kwargs) + + return sw def combine_sample_sizes(pairs, axis, **kwargs): # Create a nested list of N and recursively concatenate it @@ -1006,6 +1023,91 @@ def combine_arrays( return x +def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + # Create a nested list of N and recursively concatenate it + # along the specified axes + return combine_arrays(pairs, key, chunk.sum, axis, dtype, + computing_meta, **kwargs) + + +# -------------------------------------------------------------------- +# mean +# -------------------------------------------------------------------- +def cf_mean_chunk(x, weights=None, dtype=float, computing_meta=False, **kwargs): + """Find the max of an array.""" + if computing_meta: + return x + + d = cf_sum_chunk(x, weights=weights, **kwargs) + + if weights is None: + sw = d["N"] + else: + sw = chunk.sum(weights, **kwargs) + + d["sw"] = sw + return d + +def cf_mean_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Apply the function to the data in a nested list of arrays.""" + if not isinstance(pairs, list): + pairs = [pairs] + + d = {} + for key in ("sum", "sw"): + d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + d["N"] = combine_sample_sizes(pairs, axis, **kwargs) + return d + +def cf_mean_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=1, + original_shape=None, + **kwargs, +): + """Apply the function to the data in a nested list of arrays and + mask where the sample size is below the threshold.""" + d = cf_mean_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = divide(d["sum"], d["sw"]) # dtype? 
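# --------------------------------------------------------------------
# Quick check (plain numpy, not part of the patch) of the identity
# that the mean collapse above relies on: sum(w*x) / sum(w) equals the
# weighted average.
# --------------------------------------------------------------------
import numpy as np

x = np.array([1.0, 2.0, 4.0])
w = np.array([2.0, 1.0, 1.0])
assert np.isclose((x * w).sum() / w.sum(), np.average(x, weights=w))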
+ x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +def cf_mean(a, axis=None, weights=None, keepdims=False, mtol=1, + split_every=None): + """TODODASK.""" + dtype = float + return reduction( + a, + cf_mean_chunk, + partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights + ) + + # -------------------------------------------------------------------- # max # -------------------------------------------------------------------- @@ -1014,9 +1116,10 @@ def cf_max_chunk(x, computing_meta=False, **kwargs): if computing_meta: return x - d = {"max": chunk.max(x, **kwargs)} - d.update(cf_sample_size_chunk(x, **kwargs)) - return d + return { + "max": chunk.max(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } def cf_max_combine( @@ -1037,7 +1140,10 @@ def cf_max_combine( if computing_meta: return m - return {"N": combine_sample_sizes(pairs, axis, **kwargs), "max": m} + return { + "max": m, + "N": combine_sample_sizes(pairs, axis, **kwargs), + } def cf_max_agg( @@ -1050,11 +1156,13 @@ def cf_max_agg( ): """Find the range of a nested list of arrays.""" d = cf_max_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + - m = d["max"] - m = mask_small_sample_size(m, d["N"], axis, mtol, original_shape) - - return m + x = d["max"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x def cf_max(a, axis=None, keepdims=False, mtol=1, split_every=None): @@ -1102,10 +1210,8 @@ def cf_max_abs_chunk(x, computing_meta=False, **kwargs): """ if computing_meta: return x - - d = {"max": cf_max_chunk(np.abs(x), **kwargs)} - d.update(cf_sample_size_chunk(x, **kwargs)) - return d + + return cf_max_chunk(np.abs(x), **kwargs) cf_max_abs_combine = cf_max_combine @@ -1119,9 +1225,10 @@ def cf_min_chunk(x, computing_meta=False, **kwargs): if computing_meta: return x - d = {"min": chunk.min(x, **kwargs)} - d.update(cf_sample_size_chunk(x, **kwargs)) - return d + return { + "min": chunk.min(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } def cf_min_combine( @@ -1136,13 +1243,16 @@ def cf_min_combine( # Create a nested list of maxima and recursively concatenate it # along the specified axes - m = combine_arrays( + x = combine_arrays( pairs, "min", chunk.min, axis, None, computing_meta, **kwargs ) if computing_meta: return m - return {"N": combine_sample_sizes(pairs, axis, **kwargs), "min": m} + return { + "min": x, + "N": combine_sample_sizes(pairs, axis, **kwargs), + } def cf_min_agg( @@ -1155,11 +1265,12 @@ def cf_min_agg( ): """Find the range of a nested list of arrays.""" d = cf_min_combine(pairs, axis, computing_meta, **kwargs) - - m = d["min"] - m = mask_small_sample_size(m, d["N"], axis, mtol, original_shape) - - return m + if computing_meta: + return d + + x = d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x @@ -1207,9 +1318,7 @@ def cf_min_abs_chunk(x, computing_meta=False, **kwargs): if computing_meta: return x - d = {"min": cf_min_chunk(np.abs(x), **kwargs)} - d.update(cf_sample_size_chunk(x, **kwargs)) - return d + return cf_min_chunk(np.abs(x), **kwargs) cf_min_abs_combine = cf_min_combine @@ -1224,9 +1333,8 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): return x d = cf_max_chunk(x, **kwargs) - d.update(cf_min_chunk(x, **kwargs)) - 
d.update(cf_sample_size_chunk(x, dtype=dtype, **kwargs)) - return d + d["min"] = chunk.min(x, **kwargs) + return d def cf_range_combine( @@ -1253,9 +1361,9 @@ def cf_range_combine( ) return { - "N": combine_sample_sizes(pairs, axis, **kwargs), "max": mx, "min": mn, + "N": combine_sample_sizes(pairs, axis, **kwargs), } @@ -1269,12 +1377,13 @@ def cf_range_agg( ): """Find the range of a nested list of arrays.""" d = cf_range_combine(pairs, axis, computing_meta, **kwargs) - + if computing_meta: + return d + # Calculate the range - r = d["max"] - d["min"] - r = mask_small_sample_size(r, d["N"], axis, mtol, original_shape) - - return r + x = d["max"] - d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x def cf_range(a, axis=None, keepdims=False, mtol=1, split_every=None): @@ -1293,11 +1402,9 @@ def cf_range(a, axis=None, keepdims=False, mtol=1, split_every=None): concatenate=False, meta=np.array((), dtype=dtype), ) - # -------------------------------------------------------------------- # mid-range # -------------------------------------------------------------------- - cf_mid_range_chunk = cf_range_chunk cf_mid_range_combine = cf_range_combine @@ -1313,13 +1420,14 @@ def cf_mid_range_agg( ): """Find the mid-range of a nested list of arrays.""" d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + # Calculate the mid-range - mr = d["max"] + d["min"] - mr = divide(mr, 2.0, dtype=dtype) - mr = mask_small_sample_size(mr, d["N"], axis, mtol, original_shape) - - return mr + x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x def cf_mid_range( @@ -1341,6 +1449,7 @@ def cf_mid_range( meta=np.array((), dtype=dtype), ) + # -------------------------------------------------------------------- # sample_size # -------------------------------------------------------------------- @@ -1349,7 +1458,7 @@ def cf_sample_size_chunk(x, dtype=int, computing_meta=False, **kwargs): return x if np.ma.isMA(x): - N = chunk.sum(~x.mask, dtype=dtype, **kwargs) + N = chunk.sum(~np.ma.getmaskarray(x), dtype=dtype, **kwargs) if not np.ndim(N): N = np.asanyarray(N) else: @@ -1370,28 +1479,29 @@ def cf_sample_size_combine( if not isinstance(pairs, list): pairs = [pairs] - N = combine_arrays(pairs, "N", chunk.sum, axis, None, + x = combine_arrays(pairs, "N", chunk.sum, axis, None, computing_meta, **kwargs) if computing_meta: - return N - - return {"N": N} + return x + + return {"N": x} def cf_sample_size_agg( pairs, axis=None, computing_meta=False, - func=None, mtol=1, original_shape=None, **kwargs, ): d = cf_sample_size_combine(pairs, axis, computing_meta, **kwargs) - - N = d["N"] - N = mask_small_sample_size(N, N, axis, mtol, original_shape) - return N + if computing_meta: + return d + + x = d["N"] + x = mask_small_sample_size(x, x, axis, mtol, original_shape) + return x def cf_sample_size(a, axis=None, keepdims=False, mtol=1, split_every=None): @@ -1416,13 +1526,16 @@ def cf_sample_size(a, axis=None, keepdims=False, mtol=1, split_every=None): # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- -def cf_sum_chunk(x, dtype=float, computing_meta=False, func=None, **kwargs): +def cf_sum_chunk(x, weights=None, dtype=float, computing_meta=False, **kwargs): """Find the max of an array.""" if computing_meta: return x - d = {"sum": chunk.sum(x, **kwargs)} - 
d.update(cf_sample_size_chunk(x, dtype=dtype, **kwargs)) + if weights is not None: + x = multiply(x, weights) # sort out dtype=result_dtype) + + d = cf_sample_size_chunk(x, **kwargs) + d["sum"] = chunk.sum(x, dtype=dtype, **kwargs) return d @@ -1431,7 +1544,6 @@ def cf_sum_combine( axis=None, dtype="f8", computing_meta=False, - func=None, **kwargs, ): """Apply the function to the data in a nested list of arrays.""" @@ -1440,13 +1552,14 @@ def cf_sum_combine( # Create a nested list of maxima and recursively concatenate it # along the specified axes - x = combine_arrays( - pairs, "sum", chunk.sum, axis, dtype, computing_meta, **kwargs - ) + x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) if computing_meta: return x - return {"N": combine_sample_sizes(pairs, axis, **kwargs), "sum": x} + return { + "sum": x, + "N": combine_sample_sizes(pairs, axis, **kwargs), + } def cf_sum_agg( @@ -1454,7 +1567,6 @@ def cf_sum_agg( axis=None, dtype="f8", computing_meta=False, - func=None, mtol=1, original_shape=None, **kwargs, @@ -1462,18 +1574,20 @@ def cf_sum_agg( """Apply the function to the data in a nested list of arrays and mask where the sample size is below the threshold.""" d = cf_sum_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d - x = mask_small_sample_size(d["sum"], d["N"], axis, mtol, original_shape) + x = d["sum"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_sum(a, axis=None, keepdims=False, mtol=1, split_every=None): +def cf_sum(a, axis=None, weights=None, keepdims=False, mtol=1, split_every=None): """TODODASK.""" - func = chunk.sum dtype = float return reduction( a, - partial(cf_sum_chunk, func=func), + cf_sum_chunk, partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -1483,83 +1597,291 @@ def cf_sum(a, axis=None, keepdims=False, mtol=1, split_every=None): out=None, concatenate=False, meta=np.array((), dtype=dtype), + weights=weights ) - # -------------------------------------------------------------------- -# sum +# variance # -------------------------------------------------------------------- -def cf_sumw_chunk(x, weights=None, dtype=float, computing_meta=False, - func=None, **kwargs): - """Find the max of an array.""" - print('PPP@',x, weights) - if computing_meta: - return x +def cf_var_chunk(x, weights=None, dtype=float, computing_meta=False, + ddof=0, **kwargs): + """Return a tuple containing metrics relating to the array variance. - if weights is not None: - x = multiply(x, weights) # sort out dtype=result_dtype) + The tuple is a 7-tuple that contains, in the order given, the + following variables: + + ======== ============================================================ + Variable Description + ======== ============================================================ + N Sample size + + var Sample variance (ddof=0) + + avg Weighted mean + + V1 Sum of weights + + V2 Sum of squares of weights + + ddof Delta degrees of freedom + + weighted Whether or not the sample is weighted + ======== ============================================================ + + :Parameters: + + a: numpy array_like + Input array + + axis: `int`, optional + Axis along which to operate. By default, flattened input + is used. + + weights: numpy array-like, optional + Weights associated with values of the array. By default the + statistics are unweighted. 
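# --------------------------------------------------------------------
# Illustration (not part of the patch) of V1 and V2 from the table
# above: the sum of the weights and the sum of their squares, which
# the ddof=1 branch below needs for reliability weights.
# --------------------------------------------------------------------
import numpy as np

w = np.array([1.0, 2.0, 3.0])
V1 = w.sum()        # 6.0
V2 = (w * w).sum()  # 14.0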

+
+        ddof: delta degrees of freedom, optional
+
+    :Returns:
+
+        `dict`
+            Dictionary containing the statistical metrics described in
+            the above table.
+
+    """
+    # Make sure that x is double precision
+    x = double_precision(x)
+
+    weighted = weights is not None
+
+    # ----------------------------------------------------------------
+    # Methods:
+    #
+    # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics
+    # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance
+    # ----------------------------------------------------------------
+
+    # Calculate N = number of data points
+    # Calculate avg = mean of data
+    # Calculate V1 = sum of weights
+    d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs)
+
+    N = d["N"]
+    V1 = d["sw"]
+    avg = divide(d["sum"], V1)  # dtype?
+
+#    N, avg, V1 = mean_f(a, weights=weights, axis=axis, masked=masked)
+
+    # Calculate V2 = sum of squares of weights
+    if weights is not None and ddof == 1:
+        V2 = sum_of_weights(x, weights, N=N, squared=True, **kwargs)
+    else:
+        V2 = None
+
+#    if axis is not None and avg.size > 1:
+#        # We collapsed over a single axis and the array has 2 or more
+#        # axes, so add an extra size 1 axis to the mean so that
+#        # broadcasting works when we calculate the variance.
+#        reshape_avg = True
+#        if masked:
+#            expand_dims = numpy_ma_expand_dims
+#        else:
+#            expand_dims = numpy_expand_dims
+#
+#        avg = expand_dims(avg, axis)
+#    else:
+#        reshape_avg = False
+
+    var = x - avg
+    var *= var
+
+    if np.ma.isMA(var):
+        average = np.ma.average
+    else:
+        average = np.average
+
+    var = average(var, weights=weights, **kwargs)
+
+#    if reshape_avg:
+#        shape = avg.shape
+#        avg = avg.reshape(shape[:axis] + shape[axis + 1 :])
+
+#    (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2)
+
+    return {"var": var,
+            "avg": avg,
+            "N": N,
+            "V1": V1,
+            "V2": V2,
+#            "ddof": ddof,
+#            "weighted": weighted,
+            }
+
+
+def cf_var_combine(out, out1=None, group=False):
+    """Return a tuple of partial metrics relating to the array variance.
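# --------------------------------------------------------------------
# Check (plain numpy, not part of the patch) of the per-chunk variance
# computed above: the biased (ddof=0) weighted variance is the
# weighted average of squared deviations from the weighted mean.
# --------------------------------------------------------------------
import numpy as np

x = np.array([1.0, 2.0, 4.0])
w = np.array([1.0, 1.0, 2.0])
avg = np.average(x, weights=w)               # 2.75
var = np.average((x - avg) ** 2, weights=w)  # 1.6875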
+ + The tuple is a 7-tuple that contains, in the order given, the + following variables: + + ======== ============================================================ + Variable Description + ======== ============================================================ + N Partial sample size + + var Partial sum of V1*(variance + mean^2) + + avg Unweighted partial sum + + V1 Partial sum of weights + + V2 Partial sum of squares of weights + + ddof Delta degrees of freedom + + weighted Whether or not the population is weighted + ======== ============================================================ + + For further information, see: + https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics + + :Parameters: + + out: 7-`tuple` + + out1: 7-`tuple`, optional + + :Returns: + + 7-`tuple` + Tuple containing the value of the statistical metrics described + in the above table, in the given order. + + """ + (N, var, avg, V1, V2, ddof, weighted) = out + + if out1 is None and not group: + # ------------------------------------------------------------ + # var = V1(var+avg**2) + # avg = V1*avg = unweighted partial sum + # ------------------------------------------------------------ + var += avg * avg + var *= V1 + avg *= V1 + else: + # ------------------------------------------------------------ + # var = var + V1b(varb+avgb**2) + # avg = avg + V1b*avgb + # V1 = V1 + V1b + # V2 = V2 + V2b + # ------------------------------------------------------------ + (Nb, varb, avgb, V1b, V2b, ddof, weighted) = out1 + + N = psum(N, Nb) + + if not group: + varb += avgb * avgb + varb *= V1b + avgb *= V1b + + var = psum(var, varb) + avg = psum(avg, avgb) + V1 = psum(V1, V1b) + + if weighted and ddof == 1: + V2 = psum(V2, V2b) + # --- End: if + + (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) + + return (N, var, avg, V1, V2, ddof, weighted) + +def var_ffinalise(out, sub_samples=None): + """Calculate the variance of the array and return it with the sample + size. + + Also mask out any values derived from a too-small sample size. + + :Parameters: + + out: 7-`tuple` + + sub_samples: optional + + :Returns: + + 2-`tuple` of `numpy.ndarray` + The sample size and the variance. 
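# --------------------------------------------------------------------
# Sketch (plain numpy, not part of the patch) of the pooling identity
# used above, for the unweighted case where V1 is the sample size:
# combining the partial sums V1*(var + mean**2) and V1*mean reproduces
# the variance of the concatenated data.
# --------------------------------------------------------------------
import numpy as np

a = np.array([1.0, 2.0])
b = np.array([3.0, 4.0, 5.0])

t = a.size * (a.var() + a.mean() ** 2) + b.size * (b.var() + b.mean() ** 2)
s = a.size * a.mean() + b.size * b.mean()
V1 = a.size + b.size

assert np.isclose(t / V1 - (s / V1) ** 2, np.concatenate([a, b]).var())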
+ + """ + (N, var, avg, V1, V2, ddof, weighted) = out + + N, var = mask_where_too_few_values(max(2, ddof + 1), N, var) + N, V1 = mask_where_too_few_values(max(2, ddof + 1), N, V1) + if V2 is not None: + N, V2 = mask_where_too_few_values(max(2, ddof + 1), N, V2) + + if sub_samples: + # ---------------------------------------------------------------- + # The global biased variance = {[SUM(pV1(pv+pm**2)]/V1} - m**2 + # + # where pV1 = partial sum of weights + # pv = partial biased variance + # pm = partial mean + # V1 = global sum of weights + # m = global mean + # + # Currently: var = SUM(pV1(pv+pm**2) + # avg = V1*m + # + # https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics + # + # For the general case of M non-overlapping data sets, X_{1} + # through X_{M}, and the aggregate data set X=\bigcup_{i}X_{i} + # we have the unweighted mean and variance is: + # + # \mu_{X}={\frac{1}{\sum_{i}{N_{X_{i}}}}}\left(\sum_{i}{N_{X_{i}}\mu_{X_{i}}}\right) + # + # var_{X}={{\frac{1}{\sum_{i}{N_{X_{i}}-ddof}}}\left(\sum_{i}{\left[(N_{X_{i}}-1)\sigma_{X_{i}}^{2}+N_{X_{i}}\mu_{X_{i}}^{2}\right]}-\left[\sum_{i}{N_{X_{i}}}\right]\mu_{X}^{2}\right)} + # + # ---------------------------------------------------------------- + avg /= V1 + avg *= avg + var /= V1 + var -= avg + + # ---------------------------------------------------------------- + # var is now the biased global variance + # ---------------------------------------------------------------- + if not weighted: + if ddof: + # The unweighted variance with N-ddof degrees of freedom is + # [V1/(V1-ddof)]*var. In this case V1 equals the sample size, + # N. ddof=1 provides an unbiased estimator of the variance of + # a hypothetical infinite population. + V1 /= V1 - ddof + var *= V1 + elif ddof == 1: + # Calculate the weighted unbiased variance. The unbiased + # variance weighted with _reliability_ weights is + # [V1**2/(V1**2-V2)]*var. + # + # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance + V1 **= 2 + var *= V1 + V1 -= V2 + var /= V1 + elif ddof: + raise ValueError( + "Can only calculate a weighted variance with a delta degrees " + "of freedom (ddof) of 0 or 1: Got {}".format(ddof) + ) + + return asanyarray(N, var) + + diff --git a/cf/data/data.py b/cf/data/data.py index 1aea089748..c2d31a3b61 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -7894,183 +7894,203 @@ def minimum_absolute_value( _preserve_partitions=_preserve_partitions, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def mean( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, inplace=False, + split_every=None, i=False, - _preserve_partitions=False, ): - """Collapse axes with their mean. - - The mean is unweighted by default, but may be weighted (see the - *weights* parameter). - - Missing data array elements and their corresponding weights - are omitted from the calculation. - - :Parameters: - - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. - - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. 
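# --------------------------------------------------------------------
# Usage sketch for the rewired method (expected values taken from the
# docstring examples below; weights handling is still being worked out
# at this point in the series).
# --------------------------------------------------------------------
import cf

d = cf.Data([[1, 2, 4], [1, 4, 9]], "m")
m = d.mean(axes=1, squeeze=True)  # approximately [2.33, 4.67]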
By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. - - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: + from .collapse_functions import cf_mean as func - `Data` or `None` - The collapsed array. - - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` - - **Examples:** - - >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm') - >>> print(d.array) - [[1 2 4] - [1 4 9]] - - >>> d.mean() - - >>> d.mean(squeeze=True) - - >>> d.mean(axes=[0, 1]) - - >>> d.mean(axes=[1, 0]) - - >>> print(d.mean(axes=0).array) - [[1. 3. 6.5]] - >>> print(d.mean(axes=1).array) - [[2.33333333] - [4.66666667]] - >>> d.mean(axes=1, squeeze=True) - - - >>> y = cf.Data([1, 3]) - >>> x = cf.Data([1, 2, 1]) - >>> w = cf.Data.insert_dimension(y, 1) * x - >>> print(w.array) - [[1 2 1] - [3 6 3]] - - >>> d.mean(weights=w) - - >>> d.mean(weights={(0, 1): w}) - - >>> d.mean(axes=[0, 1], weights={(0, 1): w}) - - >>> d.mean(axes=[1, 0], weights={(0, 1): w}) - - >>> d.mean(axes=(0, 1), weights={1: x, 0: y}) - - - >>> d.mean(axes=1, weights=w) - - >>> d.mean(axes=1, weights=x) - - >>> d.mean(axes=1, weights={1: x}) - - >>> d.mean(axes=1, weights={(0, 1): w}) - - >>> d.mean(axes=1, weights={0: y, (1,): x}) - - - >>> d.mean(axes=1) - - >>> d.mean(axes=1, weights={0: y}) - - - >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4)) - >>> print(e.array) - [[[ 0 1 2 3] - [ 4 5 6 7]] - [[ 8 9 10 11] - [12 13 14 15]] - [[16 17 18 19] - [20 21 22 23]]] - - >>> e.mean(axes=[0, 2]) - - >>> f = e.mean(axes=[0, 2], squeeze=True) - >>> f - - >>> f.shape - (2,) - >>> print(e.mean(axes=[0, 1]).array) - [[[10. 11. 12. 13.]]] - >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) - [[[11. 12. 13. 
14.]]] - - >>> e[0, 0] = cf.masked - >>> e[-1, -1] = cf.masked - >>> e[..., 2] = cf.masked - >>> print(e.array) - [[[-- -- -- --] - [4 5 -- 7]] - [[8 9 -- 11] - [12 13 -- 15]] - [[16 17 -- 19] - [-- -- -- --]]] - - >>> e.mean() - - >>> print(e.mean(axes=[0, 1]).array) - [[[10.0 11.0 -- 13.0]]] - >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) - [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]] - - """ - return self._collapse( - mean_f, - mean_fpartial, - mean_ffinalise, - axes=axes, - squeeze=squeeze, - weights=weights, - mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, - ) + d = _inplace_enabled_define_and_cleanup(self) + return _collapse( d, func, axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol ) + +# @_deprecated_kwarg_check("i") +# def mean( +# self, +# axes=None, +# squeeze=False, +# mtol=1, +# weights=None, +# inplace=False, +# i=False, +# _preserve_partitions=False, +# ): +# """Collapse axes with their mean. +# +# The mean is unweighted by default, but may be weighted (see the +# *weights* parameter). +# +# Missing data array elements and their corresponding weights +# are omitted from the calculation. +# +# :Parameters: +# +# axes: (sequence of) int, optional +# The axes to be collapsed. By default flattened input is +# used. Each axis is identified by its integer position. No +# axes are collapsed if *axes* is an empty sequence. +# +# squeeze: `bool`, optional +# If True then collapsed axes are removed. By default the +# axes which are collapsed are left in the result as axes +# with size 1, meaning that the result is guaranteed to +# broadcast correctly against the original array. +# +# weights: data-like or dict, optional +# Weights associated with values of the array. By default +# all non-missing elements of the array are assumed to have +# a weight equal to one. If *weights* is a data-like object +# then it must have either the same shape as the array or, +# if that is not the case, the same shape as the axes being +# collapsed. If *weights* is a dictionary then each key is +# axes of the array (an int or tuple of ints) with a +# corresponding data-like value of weights for those +# axes. In this case, the implied weights array is the outer +# product of the dictionary's values. +# +# *Parameter example:* +# If ``weights={1: w, (2, 0): x}`` then ``w`` must contain +# 1-dimensional weights for axis 1 and ``x`` must contain +# 2-dimensional weights for axes 2 and 0. This is +# equivalent, for example, to ``weights={(1, 2, 0), y}``, +# where ``y`` is the outer product of ``w`` and ``x``. If +# ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is +# equivalent to ``weights=y``. If ``axes=None`` and the +# array is 3-dimensional then ``weights={(1, 2, 0), y}`` +# is equivalent to ``weights=y.transpose([2, 0, 1])``. +# +# mtol: number, optional +# +# {{inplace: `bool`, optional}} +# +# {{i: deprecated at version 3.0.0}} +# +# :Returns: +# +# `Data` or `None` +# The collapsed array. +# +# .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, +# `var` +# +# **Examples:** +# +# >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm') +# >>> print(d.array) +# [[1 2 4] +# [1 4 9]] +# +# >>> d.mean() +# +# >>> d.mean(squeeze=True) +# +# >>> d.mean(axes=[0, 1]) +# +# >>> d.mean(axes=[1, 0]) +# +# >>> print(d.mean(axes=0).array) +# [[1. 3. 
6.5]]
+# >>> print(d.mean(axes=1).array)
+# [[2.33333333]
+# [4.66666667]]
+# >>> d.mean(axes=1, squeeze=True)
+#
+#
+# >>> y = cf.Data([1, 3])
+# >>> x = cf.Data([1, 2, 1])
+# >>> w = cf.Data.insert_dimension(y, 1) * x
+# >>> print(w.array)
+# [[1 2 1]
+# [3 6 3]]
+#
+# >>> d.mean(weights=w)
+#
+# >>> d.mean(weights={(0, 1): w})
+#
+# >>> d.mean(axes=[0, 1], weights={(0, 1): w})
+#
+# >>> d.mean(axes=[1, 0], weights={(0, 1): w})
+#
+# >>> d.mean(axes=(0, 1), weights={1: x, 0: y})
+#
+#
+# >>> d.mean(axes=1, weights=w)
+#
+# >>> d.mean(axes=1, weights=x)
+#
+# >>> d.mean(axes=1, weights={1: x})
+#
+# >>> d.mean(axes=1, weights={(0, 1): w})
+#
+# >>> d.mean(axes=1, weights={0: y, (1,): x})
+#
+#
+# >>> d.mean(axes=1)
+#
+# >>> d.mean(axes=1, weights={0: y})
+#
+#
+# >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4))
+# >>> print(e.array)
+# [[[ 0 1 2 3]
+# [ 4 5 6 7]]
+# [[ 8 9 10 11]
+# [12 13 14 15]]
+# [[16 17 18 19]
+# [20 21 22 23]]]
+#
+# >>> e.mean(axes=[0, 2])
+#
+# >>> f = e.mean(axes=[0, 2], squeeze=True)
+# >>> f
+#
+# >>> f.shape
+# (2,)
+# >>> print(e.mean(axes=[0, 1]).array)
+# [[[10. 11. 12. 13.]]]
+# >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array)
+# [[[11. 12. 13. 14.]]]
+#
+# >>> e[0, 0] = cf.masked
+# >>> e[-1, -1] = cf.masked
+# >>> e[..., 2] = cf.masked
+# >>> print(e.array)
+# [[[-- -- -- --]
+# [4 5 -- 7]]
+# [[8 9 -- 11]
+# [12 13 -- 15]]
+# [[16 17 -- 19]
+# [-- -- -- --]]]
+#
+# >>> e.mean()
+#
+# >>> print(e.mean(axes=[0, 1]).array)
+# [[[10.0 11.0 -- 13.0]]]
+# >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array)
+# [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]]
+#
+# """
+# return self._collapse(
+#     mean_f,
+#     mean_fpartial,
+#     mean_ffinalise,
+#     axes=axes,
+#     squeeze=squeeze,
+#     weights=weights,
+#     mtol=mtol,
+#     inplace=inplace,
+#     _preserve_partitions=_preserve_partitions,
+# )

     def mean_absolute_value(
         self,
@@ -10301,6 +10321,23 @@ def override_calendar(self, calendar, inplace=False, i=False):

         return d

+    def to_dask_array(self):
+        """Return the stored data as a dask array.
+
+        No copy of the underlying dask array is made.
+
+        :Returns:
+
+            `dask.array.Array`
+                The dask array contained within the `Data` instance.
+
+        **Examples:**
+
+        >>> dx = d.to_dask_array()
+
+        """
+        return self._get_dask()
+
     def to_disk(self):
         """Store the data array on disk.

@@ -12785,6 +12822,7 @@ def roll(self, axis, shift, inplace=False, i=False):
     def sum(
         self,
         axes=None,
+        weights=None,
         squeeze=False,
         mtol=1,
         inplace=False,
@@ -12795,17 +12833,9 @@ def sum(
         from .collapse_functions import cf_sum

         d = _inplace_enabled_define_and_cleanup(self)
-        dx = d._get_dask()
-        dx = cf_sum(
-            dx,
-            axis=axes,
-            keepdims=not squeeze,
-            split_every=split_every,
-            mtol=mtol,
-        )
-        d._set_dask(dx, reset_mask_hardness=True)
-
-        return d
+        return _collapse( d, cf_sum, axis=axes, weights=weights,
+                          keepdims=not squeeze,
+                          split_every=split_every, mtol=mtol )

     def sum_of_squares(
         self,
@@ -13636,13 +13666,10 @@ def _collapse(
 ):
     """TODODASK."""
     dx = d._get_dask()
-
-    if weights is not None:
-        weights = da.asanyarray(dask_compatible(weights))
-
     dx = collapse_func(
-        (dx, weights),
+        dx,
         axis=axis,
+        weights=weights,
         keepdims=keepdims,
         split_every=split_every,
         mtol=mtol,
diff --git a/cf/data/utils.py b/cf/data/utils.py
index cda7f77c6e..9a1a144ab3 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -422,7 +422,7 @@ def dask_compatible(a):
     except AttributeError:
         return a

-
+
 def scalar_masked_array(dtype=float):
     """Return a scalar masked array.
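
The commits in this series funnel each collapse through dask's
``reduction(chunk, combine, aggregate)`` machinery: the chunk function
returns a dictionary of partial results per data chunk, the combine
function concatenates and re-reduces those dictionaries down the
reduction tree, and the aggregate function forms the final statistic
and applies the ``mtol`` missing-data mask. What follows is a minimal,
self-contained sketch of that pattern for a sample-size-aware sum; it
is not part of the patch, the names are illustrative, and it assumes a
dask version whose ``reduction`` accepts the ``combine``,
``concatenate`` and ``meta`` keywords used above.

from functools import partial

import dask.array as da
import numpy as np
from dask.array import chunk
from dask.array.core import _concatenate2
from dask.array.reductions import reduction
from dask.utils import deepmap


def example_sum_chunk(x, dtype="f8", computing_meta=False, **kwargs):
    """Per-chunk pass: return the partial sum and the sample size."""
    if computing_meta:
        return x

    # For masked input, ones_like preserves the mask, so "N" counts
    # only the non-missing elements
    return {
        "sum": chunk.sum(x, dtype=dtype, **kwargs),
        "N": chunk.sum(np.ones_like(x, dtype="i8"), **kwargs),
    }


def example_sum_combine(
    pairs, axis=None, dtype="f8", computing_meta=False, **kwargs
):
    """Tree step: concatenate partial results and sum them again."""
    if not isinstance(pairs, list):
        pairs = [pairs]

    d = {}
    for key in ("sum", "N"):
        x = deepmap(lambda p: p[key], pairs) if not computing_meta else pairs
        x = chunk.sum(_concatenate2(x, axes=axis), axis=axis, **kwargs)
        if computing_meta:
            return x

        d[key] = x

    return d


def example_sum_agg(
    pairs,
    axis=None,
    dtype="f8",
    computing_meta=False,
    mtol=1,
    original_shape=None,
    **kwargs,
):
    """Final step: combine, then mask where too much data was missing."""
    d = example_sum_combine(pairs, axis, dtype, computing_meta, **kwargs)
    if computing_meta:
        return d

    x = d["sum"]
    if mtol < 1:
        # Mask any element for which the fraction of contributing
        # non-missing values is too small
        n_max = np.prod([original_shape[i] for i in axis])
        x = np.ma.masked_where(d["N"] < (1 - mtol) * n_max, x, copy=False)

    return x


a = da.ones((4, 6), chunks=2)
s = reduction(
    a,
    example_sum_chunk,
    partial(example_sum_agg, mtol=0.5, original_shape=a.shape),
    axis=0,
    keepdims=False,
    dtype="f8",
    combine=example_sum_combine,
    concatenate=False,
    meta=np.array((), dtype="f8"),
)
print(s.compute())  # [4. 4. 4. 4. 4. 4.]
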
From b65b63e7835061cb57dcd09087f06d15b7e6fbab Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 13 Mar 2022 17:15:34 +0000 Subject: [PATCH 04/37] dev --- cf/data/collapse_functions.py | 558 ++++++++++++++++++---------------- cf/data/data.py | 518 +++++++++++++++++-------------- cf/data/utils.py | 2 +- 3 files changed, 577 insertions(+), 501 deletions(-) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index 3420170bfb..9167e65fe8 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -4,21 +4,14 @@ from operator import mul import numpy as np -from dask.array.reductions import reduction from numpy import abs as numpy_abs -from numpy import amax as numpy_amax -from numpy import amin as numpy_amin from numpy import array as numpy_array from numpy import asanyarray as numpy_asanyarray from numpy import average as numpy_average -from numpy import bool_ as numpy_bool_ -from numpy import empty as numpy_empty from numpy import expand_dims as numpy_expand_dims -from numpy import integer as numpy_integer from numpy import maximum as numpy_maximum from numpy import minimum as numpy_minimum from numpy import ndim as numpy_ndim -from numpy import sum as numpy_sum from numpy.ma import array as numpy_ma_array from numpy.ma import average as numpy_ma_average from numpy.ma import expand_dims as numpy_ma_expand_dims @@ -29,7 +22,32 @@ from ..functions import broadcast_array -def asanyarray(*args): +#def asanyarray(*args): +# """Return every input array as an numpy ndarray, or a subclass of. +# +# :Parameters: +# +# args: sequence of array-like input objects +# +# :Returns: +# +# `list` +# The input objects left as, else converted to, `numpy.ndarray` +# +# """ +# out = [] +# for x in args: +# if x is not None and not np.ndim(x): +# # Make sure that we have a numpy array (as opposed to, e.g. a +# # numpy.float64) +# out.append(np.asanyarray(x)) +# else: +# out.append(x) +# +# return out + + +def asanyarray(x): """Return every input array as an numpy ndarray, or a subclass of. :Parameters: @@ -42,16 +60,12 @@ def asanyarray(*args): The input objects left as, else converted to, `numpy.ndarray` """ - out = [] - for x in args: - if x is not None and not np.ndim(x): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - out.append(np.asanyarray(x)) - else: - out.append(x) + if not np.ndim(x): + # Make sure that we have a numpy array (e.g. as opposed to a + # numpy.float64) + return np.asanyarray(x) - return out + return x def psum(x, y): @@ -212,7 +226,6 @@ def mask_where_too_few_values(Nmin, N, x): strictly less than *Nmin*. 
""" - print(" N.min() =", N.min(), Nmin) if N.min() < Nmin: mask = N < Nmin N = numpy_ma_array(N, mask=mask, copy=False, shrink=False) @@ -243,7 +256,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): Nmax = reduce(mul, [original_shape[i] for i in axis], 1) x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) - return asanyarray(x)[0] + return asanyarray(x) def double_precision(a): @@ -272,8 +285,6 @@ def double_precision(a): return a.astype(newtype, copy=False) - - # -------------------------------------------------------------------- # mean # -------------------------------------------------------------------- @@ -349,7 +360,7 @@ def mean_fpartial(out, out1=None, group=False): # Convert the partition average to a partition sum avg *= sw else: - # Combine this partition with existing parital combination + # Combine this partition with existing partial combination N1, avg1, sw1 = out1 # Convert the partition average to a partition sum @@ -481,7 +492,6 @@ def root_mean_square_ffinalise(out, sub_samples=None): return asanyarray(N, avg) - def sum_f(a, axis=None, weights=None, masked=False): """Return the sum of an array or the sum along an axis. @@ -981,59 +991,80 @@ def sd_ffinalise(out, sub_samples=None): from dask.array import chunk -from dask.array.core import _concatenate2, broadcast_to -from dask.array.reductions import divide -from dask.array.ufunc import multiply +from dask.array.core import _concatenate2 +from dask.array.reductions import divide, numel, reduction +from dask.core import flatten from dask.utils import deepmap # Apply function inside nested lists -def sum_of_weights(x, weights=None, N=None, squared=False, **kwargs): - """TODO""" + +def sum_of_weights( + x, weights=None, dtype="f8", N=None, squared=False, **kwargs +): + """TODO.""" if weights is None: if N is None: - N = cf_sample_size_chunk(x, **kwargs)["N"] - - sw = N - else: - if squared: - weights = multiply(weights, weights, dtype=float) + N = cf_sample_size_chunk(x, dtype=dtype, **kwargs)["N"] - if np.ma.is_masked(x): - weights = np.ma.masked_where(x.mask, weights) + return N - sw = weights.sum(**kwargs) + if squared: + weights = np.multiply(weights, weights, dtype=dtype) - return sw + if np.ma.is_masked(x): + weights = np.ma.masked_where(x.mask, weights) -def combine_sample_sizes(pairs, axis, **kwargs): - # Create a nested list of N and recursively concatenate it - # along the specified axes - return combine_arrays(pairs, "N", chunk.sum, axis, int, False, **kwargs) + return chunk.sum(weights, dtype=dtype, **kwargs) def combine_arrays( pairs, key, func, axis, dtype, computing_meta=False, **kwargs ): + """rename *key*""" # Create a nested list of N and recursively concatenate it # along the specified axes - x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs + if isinstance(key, str): + dm_func = lambda pair: pair[key] + else: + dm_func = key + + x = deepmap(dm_func, pairs) if not computing_meta else pairs if dtype: kwargs["dtype"] = dtype - x = func(_concatenate2(x, axes=axis), axis=axis, **kwargs) - return x + return func(_concatenate2(x, axes=axis), axis=axis, **kwargs) def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): - # Create a nested list of N and recursively concatenate it - # along the specified axes - return combine_arrays(pairs, key, chunk.sum, axis, dtype, - computing_meta, **kwargs) + """Alias of `combine_arrays` with ``func=chunk.sum``.""" + return combine_arrays( + pairs, key, chunk.sum, axis, dtype, computing_meta, **kwargs + ) + 
+
+def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
+    """Alias of `combine_arrays` with ``func=chunk.max``."""
+    return combine_arrays(
+        pairs, key, chunk.max, axis, dtype, computing_meta, **kwargs
+    )
+
+
+def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
+    """Alias of `combine_arrays` with ``func=chunk.min``."""
+    return combine_arrays(
+        pairs, key, chunk.min, axis, dtype, computing_meta, **kwargs
+    )
+
+
+def sum_sample_sizes(pairs, axis, **kwargs):
+    """Alias of `combine_arrays` with ``key="N", func=chunk.sum,
+    computing_meta=False, dtype="i8"``."""
+    return combine_arrays(pairs, "N", chunk.sum, axis, "i8", False, **kwargs)


 # --------------------------------------------------------------------
 # mean
 # --------------------------------------------------------------------
-def cf_mean_chunk(x, weights=None, dtype=float, computing_meta=False, **kwargs):
+def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs):
     """Chunk calculations for the mean."""
     if computing_meta:
         return x
@@ -1044,10 +1075,11 @@ def cf_mean_chunk(x, weights=None, dtype=float, computing_meta=False, **kwargs):
         sw = d["N"]
     else:
         sw = chunk.sum(weights, **kwargs)
-
+
     d["sw"] = sw
     return d

+
 def cf_mean_combine(
     pairs,
     axis=None,
@@ -1058,22 +1090,23 @@ def cf_mean_combine(
     """Combine intermediate results for the mean."""
     if not isinstance(pairs, list):
         pairs = [pairs]
-
+
     d = {}
-    for key in ("sum", "sw"):
+    for key in ("sum", "sw"):
         d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs)
         if computing_meta:
-            return x
-
-    d["N"] = combine_sample_sizes(pairs, axis, **kwargs)
+            return d[key]
+
+    d["N"] = sum_sample_sizes(pairs, axis, **kwargs)
     return d

+
 def cf_mean_agg(
     pairs,
     axis=None,
     dtype="f8",
     computing_meta=False,
-    mtol=1,
+    mtol=None,
     original_shape=None,
     **kwargs,
 ):
@@ -1082,14 +1115,15 @@ def cf_mean_agg(
     d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs)
     if computing_meta:
         return d
-
+
     x = divide(d["sum"], d["sw"])  # dtype?
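     # Implementation note: d["sum"] holds the combined (weighted) sum
     # and d["sw"] the combined sum of weights, which is identical to
     # the sample size d["N"] for unweighted collapses, so this division
     # yields the mean; mask_small_sample_size below then masks any
     # element whose sample size violates the *mtol* threshold.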
x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_mean(a, axis=None, weights=None, keepdims=False, mtol=1, - split_every=None): +def cf_mean( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None +): """TODODASK.""" dtype = float return reduction( @@ -1104,7 +1138,7 @@ def cf_mean(a, axis=None, weights=None, keepdims=False, mtol=1, out=None, concatenate=False, meta=np.array((), dtype=dtype), - weights=weights + weights=weights, ) @@ -1134,15 +1168,13 @@ def cf_max_combine( # Create a nested list of maxima and recursively concatenate it # along the specified axes - m = combine_arrays( - pairs, "max", chunk.max, axis, None, computing_meta, **kwargs - ) + m = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) if computing_meta: return m return { "max": m, - "N": combine_sample_sizes(pairs, axis, **kwargs), + "N": sum_sample_sizes(pairs, axis, **kwargs), } @@ -1150,7 +1182,7 @@ def cf_max_agg( pairs, axis=None, computing_meta=False, - mtol=1, + mtol=None, original_shape=None, **kwargs, ): @@ -1158,14 +1190,13 @@ def cf_max_agg( d = cf_max_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d - x = d["max"] x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_max(a, axis=None, keepdims=False, mtol=1, split_every=None): +def cf_max(a, axis=None, keepdims=False, mtol=None, split_every=None): """TODODASK.""" dtype = a.dtype return reduction( @@ -1183,7 +1214,6 @@ def cf_max(a, axis=None, keepdims=False, mtol=1, split_every=None): ) - # -------------------------------------------------------------------- # maximum_absolute_value # -------------------------------------------------------------------- @@ -1210,13 +1240,14 @@ def cf_max_abs_chunk(x, computing_meta=False, **kwargs): """ if computing_meta: return x - + return cf_max_chunk(np.abs(x), **kwargs) cf_max_abs_combine = cf_max_combine cf_max_abs_agg = cf_max_agg + # -------------------------------------------------------------------- # min # -------------------------------------------------------------------- @@ -1225,7 +1256,7 @@ def cf_min_chunk(x, computing_meta=False, **kwargs): if computing_meta: return x - return { + return { "min": chunk.min(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], } @@ -1243,15 +1274,13 @@ def cf_min_combine( # Create a nested list of maxima and recursively concatenate it # along the specified axes - x = combine_arrays( - pairs, "min", chunk.min, axis, None, computing_meta, **kwargs - ) + x = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) if computing_meta: - return m + return x return { "min": x, - "N": combine_sample_sizes(pairs, axis, **kwargs), + "N": sum_sample_sizes(pairs, axis, **kwargs), } @@ -1259,7 +1288,7 @@ def cf_min_agg( pairs, axis=None, computing_meta=False, - mtol=1, + mtol=None, original_shape=None, **kwargs, ): @@ -1267,14 +1296,13 @@ def cf_min_agg( d = cf_min_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d - + x = d["min"] x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x - -def cf_min(a, axis=None, keepdims=False, mtol=1, split_every=None): +def cf_min(a, axis=None, keepdims=False, mtol=None, split_every=None): """TODODASK.""" dtype = a.dtype return reduction( @@ -1291,6 +1319,7 @@ def cf_min(a, axis=None, keepdims=False, mtol=1, split_every=None): meta=np.array((), dtype=dtype), ) + # -------------------------------------------------------------------- # minimum absolute value # 
-------------------------------------------------------------------- @@ -1324,6 +1353,7 @@ def cf_min_abs_chunk(x, computing_meta=False, **kwargs): cf_min_abs_combine = cf_min_combine cf_min_abs_agg = cf_min_agg + # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- @@ -1334,7 +1364,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): d = cf_max_chunk(x, **kwargs) d["min"] = chunk.min(x, **kwargs) - return d + return d def cf_range_combine( @@ -1350,20 +1380,16 @@ def cf_range_combine( # Create a nested list of maxima and recursively concatenate it # along the specified axes - mx = combine_arrays( - pairs, "max", chunk.max, axis, None, computing_meta, **kwargs - ) + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) if computing_meta: return mx - mn = combine_arrays( - pairs, "min", chunk.min, axis, None, computing_meta, **kwargs - ) + mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) return { "max": mx, "min": mn, - "N": combine_sample_sizes(pairs, axis, **kwargs), + "N": sum_sample_sizes(pairs, axis, **kwargs), } @@ -1371,7 +1397,7 @@ def cf_range_agg( pairs, axis=None, computing_meta=False, - mtol=1, + mtol=None, original_shape=None, **kwargs, ): @@ -1379,14 +1405,14 @@ def cf_range_agg( d = cf_range_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d - + # Calculate the range x = d["max"] - d["min"] x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_range(a, axis=None, keepdims=False, mtol=1, split_every=None): +def cf_range(a, axis=None, keepdims=False, mtol=None, split_every=None): """TODODASK.""" dtype = a.dtype return reduction( @@ -1402,6 +1428,8 @@ def cf_range(a, axis=None, keepdims=False, mtol=1, split_every=None): concatenate=False, meta=np.array((), dtype=dtype), ) + + # -------------------------------------------------------------------- # mid-range # -------------------------------------------------------------------- @@ -1414,7 +1442,7 @@ def cf_mid_range_agg( axis=None, dtype="f8", computing_meta=False, - mtol=1, + mtol=None, original_shape=None, **kwargs, ): @@ -1422,7 +1450,6 @@ def cf_mid_range_agg( d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) if computing_meta: return d - # Calculate the mid-range x = divide(d["max"] + d["min"], 2.0, dtype=dtype) @@ -1431,7 +1458,7 @@ def cf_mid_range_agg( def cf_mid_range( - a, axis=None, dtype=float, keepdims=False, mtol=1, split_every=None + a, axis=None, dtype=float, keepdims=False, mtol=None, split_every=None ): """TODODASK.""" dtype = float @@ -1453,19 +1480,17 @@ def cf_mid_range( # -------------------------------------------------------------------- # sample_size # -------------------------------------------------------------------- -def cf_sample_size_chunk(x, dtype=int, computing_meta=False, **kwargs): +def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): if computing_meta: return x if np.ma.isMA(x): - N = chunk.sum(~np.ma.getmaskarray(x), dtype=dtype, **kwargs) - if not np.ndim(N): - N = np.asanyarray(N) + N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) else: - axis = kwargs["axis"] - shape = [1 if i in axis else n for i, n in enumerate(x.shape)] - size = reduce(mul, [n for i, n in enumerate(x.shape) if i in axis], 1) - N = np.full(shape, size, dtype=dtype) + if dtype: + kwargs["dtype"] = dtype + + N = numel(x, **kwargs) return {"N": N} @@ -1479,11 +1504,10 @@ def 
cf_sample_size_combine( if not isinstance(pairs, list): pairs = [pairs] - x = combine_arrays(pairs, "N", chunk.sum, axis, None, - computing_meta, **kwargs) + x = sum_arrays(pairs, "N", axis, None, computing_meta, **kwargs) if computing_meta: return x - + return {"N": x} @@ -1491,20 +1515,20 @@ def cf_sample_size_agg( pairs, axis=None, computing_meta=False, - mtol=1, + mtol=None, original_shape=None, **kwargs, ): d = cf_sample_size_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d - + x = d["N"] x = mask_small_sample_size(x, x, axis, mtol, original_shape) return x -def cf_sample_size(a, axis=None, keepdims=False, mtol=1, split_every=None): +def cf_sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): """TODODASK.""" dtype = int return reduction( @@ -1522,17 +1546,16 @@ def cf_sample_size(a, axis=None, keepdims=False, mtol=1, split_every=None): ) - # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- -def cf_sum_chunk(x, weights=None, dtype=float, computing_meta=False, **kwargs): +def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Find the max of an array.""" if computing_meta: return x if weights is not None: - x = multiply(x, weights) # sort out dtype=result_dtype) + x = np.multiply(x, weights, dtype=dtype) d = cf_sample_size_chunk(x, **kwargs) d["sum"] = chunk.sum(x, dtype=dtype, **kwargs) @@ -1550,6 +1573,7 @@ def cf_sum_combine( if not isinstance(pairs, list): pairs = [pairs] + # Create a nested list of maxima and recursively concatenate it # along the specified axes x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) @@ -1558,7 +1582,7 @@ def cf_sum_combine( return { "sum": x, - "N": combine_sample_sizes(pairs, axis, **kwargs), + "N": sum_sample_sizes(pairs, axis, **kwargs), } @@ -1567,7 +1591,7 @@ def cf_sum_agg( axis=None, dtype="f8", computing_meta=False, - mtol=1, + mtol=None, original_shape=None, **kwargs, ): @@ -1582,7 +1606,9 @@ def cf_sum_agg( return x -def cf_sum(a, axis=None, weights=None, keepdims=False, mtol=1, split_every=None): +def cf_sum( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None +): """TODODASK.""" dtype = float return reduction( @@ -1597,13 +1623,16 @@ def cf_sum(a, axis=None, weights=None, keepdims=False, mtol=1, split_every=None) out=None, concatenate=False, meta=np.array((), dtype=dtype), - weights=weights + weights=weights, ) + + # -------------------------------------------------------------------- # variance # -------------------------------------------------------------------- -def cf_var_chunk(x, weights=None, dtype=float, computing_meta=False, - ddof=0, **kwargs): +def cf_var_chunk( + x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs +): """Return a tuple containing metrics relating to the array variance. The tuple is a 7-tuple that contains, in the order given, the @@ -1651,79 +1680,51 @@ def cf_var_chunk(x, weights=None, dtype=float, computing_meta=False, in the above table, in the given order. 
""" - # Make sure that a is double precision - a = double_precision(a) - - weighted = weights is not None - # ---------------------------------------------------------------- # Methods: # # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance # ---------------------------------------------------------------- + if computing_meta: + return x - # Calculate N = number of data points - # Calculate avg = mean of data - # Calculate V1 = sum of weights d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) - N = d["N"] + wsum = d["sum"] V1 = d["sw"] - avg = d["avg"] #divide(d"sum"], V1) # dtype + N = d["N"] -# N, avg, V1 = mean_f(a, weights=weights, axis=axis, masked=masked) - - # Calculate V2 = sum of squares of weights - if weights is not None and ddof == 1: - V2 = sum_of_weights(x, weights, N=N, squared=True, **kwargs) - else: - V2 = None - -# if axis is not None and avg.size > 1: -# # We collapsed over a single axis and the array has 2 or more -# # axes, so add an extra size 1 axis to the mean so that -# # broadcasting works when we calculate the variance. -# reshape_avg = True -# if masked: -# expand_dims = numpy_ma_expand_dims -# else: -# expand_dims = numpy_expand_dims -# -# avg = expand_dims(avg, axis) -# else: -# reshape_avg = False - - var = x - avg - var *= var - - if np.ma.isMA(var): - average = np.ma.average - else: - average = np.average - - var = average(var, weights=weights, **kwargs) - -# if reshape_avg: -# shape = avg.shape -# avg = avg.reshape(shape[:axis] + shape[axis + 1 :]) - -# (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) + # with np.errstate(divide="ignore", invalid="ignore"):?????? + avg = divide(wsum, V1, dtype=dtype) + part = x - avg + part *= part + if weights is not None: + part = part * weights - + part = chunk.sum(part, dtype=dtype, **kwargs) + part = part + avg * wsum - return {"var": var, - "avg": avg, - "N": N, - "V1": V1, - "V2": V2, -# "ddof": ddof, -# "weighted": weighted, + d = { + "part": part, + "wsum": wsum, + "N": N, + "V1": V1, } + if weights is not None and ddof == 1: + d["V2"] = sum_of_weights(x, weights, squared=True, **kwargs) + + return d -def cf_var_combine(out, out1=None, group=False): +def cf_var_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): """Return a tuple of partial metrics relating to the array variance. The tuple is a 7-tuple that contains, in the order given, the @@ -1750,58 +1751,118 @@ def cf_var_combine(out, out1=None, group=False): For further information, see: https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics + """ + d = {} + + weighted = "V2" in flatten(pairs) + + keys = ("part", "wsum") + if weighted: + keys += ("V1", "V2") + + for key in keys: + d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d[key] + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + + if not weighted: + d["V1"] = d["N"] + + return d + + +def cf_var_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + ddof=None, + **kwargs, +): + """Calculate the variance of the array and return it with the sample + size. + + Also mask out any values derived from a too-small sample size. + :Parameters: out: 7-`tuple` - out1: 7-`tuple`, optional + sub_samples: optional :Returns: - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. 
+ 2-`tuple` of `numpy.ndarray` + The sample size and the variance. """ - (N, var, avg, V1, V2, ddof, weighted) = out + d = cf_var_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d - if out1 is None and not group: - # ------------------------------------------------------------ - # var = V1(var+avg**2) - # avg = V1*avg = unweighted partial sum - # ------------------------------------------------------------ - var += avg * avg - var *= V1 - avg *= V1 - else: - # ------------------------------------------------------------ - # var = var + V1b(varb+avgb**2) - # avg = avg + V1b*avgb - # V1 = V1 + V1b - # V2 = V2 + V2b - # ------------------------------------------------------------ - (Nb, varb, avgb, V1b, V2b, ddof, weighted) = out1 + V1 = d["V1"] + V2 = d.get("V2") + weighted = V2 is not None - N = psum(N, Nb) + if ddof == 0: # intended equality with zero + # Weighted or unweighted variance with ddof=0 + f = 1 / V1 + elif not weighted: + # Unweighted variance with any non-zero value of ddof + f = 1 / (V1 - ddof) + elif ddof == 1: + # Weighted variance with ddof=1 + # + # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance + # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + f = V1 / (V1 * V1 - V2) + else: + raise ValueError( + "Can only calculate a weighted variance with a ddof=0 or " + f"ddof=1: Got {ddof!r}" + ) - if not group: - varb += avgb * avgb - varb *= V1b - avgb *= V1b + wsum = d["wsum"] + var = f * (d["part"] - wsum * wsum / V1) - var = psum(var, varb) - avg = psum(avg, avgb) - V1 = psum(V1, V1b) + var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) + return var - if weighted and ddof == 1: - V2 = psum(V2, V2b) - # --- End: if - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) +def cf_var( a, axis=None, weights=None, keepdims=False, mtol=None, + ddof=None, split_every=None ): + """TODODASK.""" + dtype = float + return reduction( + a, + partial(cf_var_chunk, ddof=ddof), + partial(cf_var_agg, mtol=mtol, ddof=ddof, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_var_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) - return (N, var, avg, V1, V2, ddof, weighted) -def var_ffinalise(out, sub_samples=None): +def cf_sd_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + ddof=None, + **kwargs, +): """Calculate the variance of the array and return it with the sample size. @@ -1819,69 +1880,34 @@ def var_ffinalise(out, sub_samples=None): The sample size and the variance. 
""" - (N, var, avg, V1, V2, ddof, weighted) = out - - N, var = mask_where_too_few_values(max(2, ddof + 1), N, var) - N, V1 = mask_where_too_few_values(max(2, ddof + 1), N, V1) - if V2 is not None: - N, V2 = mask_where_too_few_values(max(2, ddof + 1), N, V2) + d = cf_var_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d - if sub_samples: - # ---------------------------------------------------------------- - # The global biased variance = {[SUM(pV1(pv+pm**2)]/V1} - m**2 - # - # where pV1 = partial sum of weights - # pv = partial biased variance - # pm = partial mean - # V1 = global sum of weights - # m = global mean - # - # Currently: var = SUM(pV1(pv+pm**2) - # avg = V1*m - # - # https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - # - # For the general case of M non-overlapping data sets, X_{1} - # through X_{M}, and the aggregate data set X=\bigcup_{i}X_{i} - # we have the unweighted mean and variance is: - # - # \mu_{X}={\frac{1}{\sum_{i}{N_{X_{i}}}}}\left(\sum_{i}{N_{X_{i}}\mu_{X_{i}}}\right) - # - # var_{X}={{\frac{1}{\sum_{i}{N_{X_{i}}-ddof}}}\left(\sum_{i}{\left[(N_{X_{i}}-1)\sigma_{X_{i}}^{2}+N_{X_{i}}\mu_{X_{i}}^{2}\right]}-\left[\sum_{i}{N_{X_{i}}}\right]\mu_{X}^{2}\right)} - # - # ---------------------------------------------------------------- - avg /= V1 - avg *= avg - var /= V1 - var -= avg + V2 = d.get("V2") + weighted = V2 is not None - # ---------------------------------------------------------------- - # var is now the biased global variance - # ---------------------------------------------------------------- - if not weighted: - if ddof: - # The unweighted variance with N-ddof degrees of freedom is - # [V1/(V1-ddof)]*var. In this case V1 equals the sample size, - # N. ddof=1 provides an unbiased estimator of the variance of - # a hypothetical infinite population. - V1 /= V1 - ddof - var *= V1 + V1 = d["V1"] + if ddof == 0: # intended equality with zero + # Weighted or unweighted variance with ddof=0 + f = 1 / V1 + elif not weighted: + # Unweighted variance with any non-zero value of ddof + f = 1 / (V1 - ddof) elif ddof == 1: - # Calculate the weighted unbiased variance. The unbiased - # variance weighted with _reliability_ weights is - # [V1**2/(V1**2-V2)]*var. 
+ # Weighted variance with ddof=1 # # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - V1 **= 2 - var *= V1 - V1 -= V2 - var /= V1 - elif ddof: + # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + f = V1 / (V1 * V1 - V2) + else: raise ValueError( - "Can only calculate a weighted variance with a delta degrees " - "of freedom (ddof) of 0 or 1: Got {}".format(ddof) + "Can only calculate a weighted variance with a ddof=0 or " + f"ddof=1: Got {ddof!r}" ) - return asanyarray(N, var) - + wsum = d["sum"] + var = f * (d["part"] - (wsum * wsum) / V1) + var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) + return var diff --git a/cf/data/data.py b/cf/data/data.py index c2d31a3b61..e8eb97a1cc 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -14,10 +14,10 @@ import numpy as np from dask.array import Array from dask.array.core import normalize_chunks +from dask.array.routines import result_type from dask.base import is_dask_collection, tokenize from dask.core import flatten from dask.highlevelgraph import HighLevelGraph -from dask.array.routines import result_type from numpy.testing import suppress_warnings as numpy_testing_suppress_warnings from ..cfdatetime import dt as cf_dt @@ -7900,7 +7900,7 @@ def minimum_absolute_value( def mean( self, axes=None, - weights=None, + weights=None, squeeze=False, mtol=1, inplace=False, @@ -7910,187 +7910,193 @@ def mean( from .collapse_functions import cf_mean as func d = _inplace_enabled_define_and_cleanup(self) - return _collapse( d, func, axis=axes, weights=weights, - keepdims=not squeeze, - split_every=split_every, mtol=mtol ) + return _collapse( + d, + func, + axis=axes, + weights=weights, + keepdims=not squeeze, + split_every=split_every, + mtol=mtol, + ) -# @_deprecated_kwarg_check("i") -# def mean( -# self, -# axes=None, -# squeeze=False, -# mtol=1, -# weights=None, -# inplace=False, -# i=False, -# _preserve_partitions=False, -# ): -# """Collapse axes with their mean. -# -# The mean is unweighted by default, but may be weighted (see the -# *weights* parameter). -# -# Missing data array elements and their corresponding weights -# are omitted from the calculation. -# -# :Parameters: -# -# axes: (sequence of) int, optional -# The axes to be collapsed. By default flattened input is -# used. Each axis is identified by its integer position. No -# axes are collapsed if *axes* is an empty sequence. -# -# squeeze: `bool`, optional -# If True then collapsed axes are removed. By default the -# axes which are collapsed are left in the result as axes -# with size 1, meaning that the result is guaranteed to -# broadcast correctly against the original array. -# -# weights: data-like or dict, optional -# Weights associated with values of the array. By default -# all non-missing elements of the array are assumed to have -# a weight equal to one. If *weights* is a data-like object -# then it must have either the same shape as the array or, -# if that is not the case, the same shape as the axes being -# collapsed. If *weights* is a dictionary then each key is -# axes of the array (an int or tuple of ints) with a -# corresponding data-like value of weights for those -# axes. In this case, the implied weights array is the outer -# product of the dictionary's values. -# -# *Parameter example:* -# If ``weights={1: w, (2, 0): x}`` then ``w`` must contain -# 1-dimensional weights for axis 1 and ``x`` must contain -# 2-dimensional weights for axes 2 and 0. 
This is -# equivalent, for example, to ``weights={(1, 2, 0), y}``, -# where ``y`` is the outer product of ``w`` and ``x``. If -# ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is -# equivalent to ``weights=y``. If ``axes=None`` and the -# array is 3-dimensional then ``weights={(1, 2, 0), y}`` -# is equivalent to ``weights=y.transpose([2, 0, 1])``. -# -# mtol: number, optional -# -# {{inplace: `bool`, optional}} -# -# {{i: deprecated at version 3.0.0}} -# -# :Returns: -# -# `Data` or `None` -# The collapsed array. -# -# .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, -# `var` -# -# **Examples:** -# -# >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm') -# >>> print(d.array) -# [[1 2 4] -# [1 4 9]] -# -# >>> d.mean() -# -# >>> d.mean(squeeze=True) -# -# >>> d.mean(axes=[0, 1]) -# -# >>> d.mean(axes=[1, 0]) -# -# >>> print(d.mean(axes=0).array) -# [[1. 3. 6.5]] -# >>> print(d.mean(axes=1).array) -# [[2.33333333] -# [4.66666667]] -# >>> d.mean(axes=1, squeeze=True) -# -# -# >>> y = cf.Data([1, 3]) -# >>> x = cf.Data([1, 2, 1]) -# >>> w = cf.Data.insert_dimension(y, 1) * x -# >>> print(w.array) -# [[1 2 1] -# [3 6 3]] -# -# >>> d.mean(weights=w) -# -# >>> d.mean(weights={(0, 1): w}) -# -# >>> d.mean(axes=[0, 1], weights={(0, 1): w}) -# -# >>> d.mean(axes=[1, 0], weights={(0, 1): w}) -# -# >>> d.mean(axes=(0, 1), weights={1: x, 0: y}) -# -# -# >>> d.mean(axes=1, weights=w) -# -# >>> d.mean(axes=1, weights=x) -# -# >>> d.mean(axes=1, weights={1: x}) -# -# >>> d.mean(axes=1, weights={(0, 1): w}) -# -# >>> d.mean(axes=1, weights={0: y, (1,): x}) -# -# -# >>> d.mean(axes=1) -# -# >>> d.mean(axes=1, weights={0: y}) -# -# -# >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4)) -# >>> print(e.array) -# [[[ 0 1 2 3] -# [ 4 5 6 7]] -# [[ 8 9 10 11] -# [12 13 14 15]] -# [[16 17 18 19] -# [20 21 22 23]]] -# -# >>> e.mean(axes=[0, 2]) -# -# >>> f = e.mean(axes=[0, 2], squeeze=True) -# >>> f -# -# >>> f.shape -# (2,) -# >>> print(e.mean(axes=[0, 1]).array) -# [[[10. 11. 12. 13.]]] -# >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) -# [[[11. 12. 13. 14.]]] -# -# >>> e[0, 0] = cf.masked -# >>> e[-1, -1] = cf.masked -# >>> e[..., 2] = cf.masked -# >>> print(e.array) -# [[[-- -- -- --] -# [4 5 -- 7]] -# [[8 9 -- 11] -# [12 13 -- 15]] -# [[16 17 -- 19] -# [-- -- -- --]]] -# -# >>> e.mean() -# -# >>> print(e.mean(axes=[0, 1]).array) -# [[[10.0 11.0 -- 13.0]]] -# >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) -# [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]] -# -# """ -# return self._collapse( -# mean_f, -# mean_fpartial, -# mean_ffinalise, -# axes=axes, -# squeeze=squeeze, -# weights=weights, -# mtol=mtol, -# inplace=inplace, -# _preserve_partitions=_preserve_partitions, -# ) + # @_deprecated_kwarg_check("i") + # def mean( + # self, + # axes=None, + # squeeze=False, + # mtol=1, + # weights=None, + # inplace=False, + # i=False, + # _preserve_partitions=False, + # ): + # """Collapse axes with their mean. + # + # The mean is unweighted by default, but may be weighted (see the + # *weights* parameter). + # + # Missing data array elements and their corresponding weights + # are omitted from the calculation. + # + # :Parameters: + # + # axes: (sequence of) int, optional + # The axes to be collapsed. By default flattened input is + # used. Each axis is identified by its integer position. No + # axes are collapsed if *axes* is an empty sequence. + # + # squeeze: `bool`, optional + # If True then collapsed axes are removed. 
By default the + # axes which are collapsed are left in the result as axes + # with size 1, meaning that the result is guaranteed to + # broadcast correctly against the original array. + # + # weights: data-like or dict, optional + # Weights associated with values of the array. By default + # all non-missing elements of the array are assumed to have + # a weight equal to one. If *weights* is a data-like object + # then it must have either the same shape as the array or, + # if that is not the case, the same shape as the axes being + # collapsed. If *weights* is a dictionary then each key is + # axes of the array (an int or tuple of ints) with a + # corresponding data-like value of weights for those + # axes. In this case, the implied weights array is the outer + # product of the dictionary's values. + # + # *Parameter example:* + # If ``weights={1: w, (2, 0): x}`` then ``w`` must contain + # 1-dimensional weights for axis 1 and ``x`` must contain + # 2-dimensional weights for axes 2 and 0. This is + # equivalent, for example, to ``weights={(1, 2, 0), y}``, + # where ``y`` is the outer product of ``w`` and ``x``. If + # ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is + # equivalent to ``weights=y``. If ``axes=None`` and the + # array is 3-dimensional then ``weights={(1, 2, 0), y}`` + # is equivalent to ``weights=y.transpose([2, 0, 1])``. + # + # mtol: number, optional + # + # {{inplace: `bool`, optional}} + # + # {{i: deprecated at version 3.0.0}} + # + # :Returns: + # + # `Data` or `None` + # The collapsed array. + # + # .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, + # `var` + # + # **Examples:** + # + # >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm') + # >>> print(d.array) + # [[1 2 4] + # [1 4 9]] + # + # >>> d.mean() + # + # >>> d.mean(squeeze=True) + # + # >>> d.mean(axes=[0, 1]) + # + # >>> d.mean(axes=[1, 0]) + # + # >>> print(d.mean(axes=0).array) + # [[1. 3. 6.5]] + # >>> print(d.mean(axes=1).array) + # [[2.33333333] + # [4.66666667]] + # >>> d.mean(axes=1, squeeze=True) + # + # + # >>> y = cf.Data([1, 3]) + # >>> x = cf.Data([1, 2, 1]) + # >>> w = cf.Data.insert_dimension(y, 1) * x + # >>> print(w.array) + # [[1 2 1] + # [3 6 3]] + # + # >>> d.mean(weights=w) + # + # >>> d.mean(weights={(0, 1): w}) + # + # >>> d.mean(axes=[0, 1], weights={(0, 1): w}) + # + # >>> d.mean(axes=[1, 0], weights={(0, 1): w}) + # + # >>> d.mean(axes=(0, 1), weights={1: x, 0: y}) + # + # + # >>> d.mean(axes=1, weights=w) + # + # >>> d.mean(axes=1, weights=x) + # + # >>> d.mean(axes=1, weights={1: x}) + # + # >>> d.mean(axes=1, weights={(0, 1): w}) + # + # >>> d.mean(axes=1, weights={0: y, (1,): x}) + # + # + # >>> d.mean(axes=1) + # + # >>> d.mean(axes=1, weights={0: y}) + # + # + # >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4)) + # >>> print(e.array) + # [[[ 0 1 2 3] + # [ 4 5 6 7]] + # [[ 8 9 10 11] + # [12 13 14 15]] + # [[16 17 18 19] + # [20 21 22 23]]] + # + # >>> e.mean(axes=[0, 2]) + # + # >>> f = e.mean(axes=[0, 2], squeeze=True) + # >>> f + # + # >>> f.shape + # (2,) + # >>> print(e.mean(axes=[0, 1]).array) + # [[[10. 11. 12. 13.]]] + # >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) + # [[[11. 12. 13. 
14.]]] + # + # >>> e[0, 0] = cf.masked + # >>> e[-1, -1] = cf.masked + # >>> e[..., 2] = cf.masked + # >>> print(e.array) + # [[[-- -- -- --] + # [4 5 -- 7]] + # [[8 9 -- 11] + # [12 13 -- 15]] + # [[16 17 -- 19] + # [-- -- -- --]]] + # + # >>> e.mean() + # + # >>> print(e.mean(axes=[0, 1]).array) + # [[[10.0 11.0 -- 13.0]]] + # >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) + # [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]] + # + # """ + # return self._collapse( + # mean_f, + # mean_fpartial, + # mean_ffinalise, + # axes=axes, + # squeeze=squeeze, + # weights=weights, + # mtol=mtol, + # inplace=inplace, + # _preserve_partitions=_preserve_partitions, + # ) def mean_absolute_value( self, @@ -12822,7 +12828,7 @@ def roll(self, axis, shift, inplace=False, i=False): def sum( self, axes=None, - weights=None, + weights=None, squeeze=False, mtol=1, inplace=False, @@ -12833,9 +12839,15 @@ def sum( from .collapse_functions import cf_sum d = _inplace_enabled_define_and_cleanup(self) - return _collapse( d, cf_sum, axis=axes, weights=weights, - keepdims=not squeeze, - split_every=split_every, mtol=mtol ) + return _collapse( + d, + cf_sum, + axis=axes, + weights=weights, + keepdims=not squeeze, + split_every=split_every, + mtol=mtol, + ) def sum_of_squares( self, @@ -13185,64 +13197,98 @@ def sd( _preserve_partitions=_preserve_partitions, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def var( self, axes=None, - squeeze=False, weights=None, + squeeze=False, mtol=1, - ddof=0, + ddof=0, inplace=False, + split_every=None, i=False, _preserve_partitions=False, ): - """Collapse axes with their weighted variance. - - The units of the returned array are the square of the units of the - array. - - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, `sum`, - `sd`, `stats` - - :Parameters: - - axes : (sequence of) int, optional - - squeeze : bool, optional - - weights : - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - The collapsed array. - - **Examples:** - - """ - units = self.Units - if units: - units = units ** 2 - - return self._collapse( - var_f, - var_fpartial, - var_ffinalise, - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_var + + d = _inplace_enabled_define_and_cleanup(self) + _collapse( + d, + partial(cf_var, ddof=ddof), + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - ddof=ddof, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + units = d.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + +# @_deprecated_kwarg_check("i") +# def var( +# self, +# axes=None, +# squeeze=False, +# weights=None, +# mtol=1, +# ddof=0, +# inplace=False, +# i=False, +# _preserve_partitions=False, +# ): +# """Collapse axes with their weighted variance. +# +# The units of the returned array are the square of the units of the +# array. +# +# .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, `sum`, +# `sd`, `stats` +# +# :Parameters: +# +# axes : (sequence of) int, optional +# +# squeeze : bool, optional +# +# weights : +# +# {{inplace: `bool`, optional}} +# +# {{i: deprecated at version 3.0.0}} +# +# :Returns: +# +# `Data` or `None` +# The collapsed array. 
+# +# **Examples:** +# +# """ +# units = self.Units +# if units: +# units = units ** 2 +# +# return self._collapse( +# var_f, +# var_fpartial, +# var_ffinalise, +# axes=axes, +# squeeze=squeeze, +# weights=weights, +# mtol=mtol, +# units=units, +# ddof=ddof, +# inplace=inplace, +# _preserve_partitions=_preserve_partitions, +# ) + def section( self, axes, stop=None, chunks=False, min_step=1, mode="dictionary" ): @@ -13300,15 +13346,13 @@ def section( self, axes, data=True, stop=stop, chunks=chunks, min_step=min_step ) - - @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def sumw( self, axes=None, - weights=None, + weights=None, squeeze=False, mtol=1, inplace=False, @@ -13662,7 +13706,13 @@ def _where_broadcastable(data, x, name): def _collapse( - d, collapse_func, axis=None, weights=None, keepdims=True, split_every=None, mtol=1 + d, + collapse_func, + axis=None, + weights=None, + keepdims=True, + split_every=None, + mtol=1, ): """TODODASK.""" dx = d._get_dask() diff --git a/cf/data/utils.py b/cf/data/utils.py index 9a1a144ab3..cda7f77c6e 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -422,7 +422,7 @@ def dask_compatible(a): except AttributeError: return a - + def scalar_masked_array(dtype=float): """Return a scalar masked array. From 8678fa3240e3276f2936850ee292e4c93ae323da Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 14 Mar 2022 17:58:10 +0000 Subject: [PATCH 05/37] dev --- cf/data/collapse_functions.py | 1907 ++++++++++++--------------------- cf/data/data.py | 1330 +++++------------------ cf/test/test_Data.py | 480 +++++---- 3 files changed, 1235 insertions(+), 2482 deletions(-) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index 9167e65fe8..aa4d4a27d0 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -1,1009 +1,61 @@ -from functools import partial -from functools import partial as functools_partial -from functools import reduce +from functools import partial, reduce from operator import mul import numpy as np -from numpy import abs as numpy_abs -from numpy import array as numpy_array -from numpy import asanyarray as numpy_asanyarray -from numpy import average as numpy_average -from numpy import expand_dims as numpy_expand_dims -from numpy import maximum as numpy_maximum -from numpy import minimum as numpy_minimum -from numpy import ndim as numpy_ndim -from numpy.ma import array as numpy_ma_array -from numpy.ma import average as numpy_ma_average -from numpy.ma import expand_dims as numpy_ma_expand_dims -from numpy.ma import isMA as numpy_ma_isMA -from numpy.ma import nomask as numpy_ma_nomask -from numpy.ma import where as numpy_ma_where - -from ..functions import broadcast_array - - -#def asanyarray(*args): -# """Return every input array as an numpy ndarray, or a subclass of. -# -# :Parameters: -# -# args: sequence of array-like input objects -# -# :Returns: -# -# `list` -# The input objects left as, else converted to, `numpy.ndarray` -# -# """ -# out = [] -# for x in args: -# if x is not None and not np.ndim(x): -# # Make sure that we have a numpy array (as opposed to, e.g. a -# # numpy.float64) -# out.append(np.asanyarray(x)) -# else: -# out.append(x) -# -# return out - - -def asanyarray(x): - """Return every input array as an numpy ndarray, or a subclass of. 
- - :Parameters: - - args: sequence of array-like input objects - - :Returns: - - `list` - The input objects left as, else converted to, `numpy.ndarray` - - """ - if not np.ndim(x): - # Make sure that we have a numpy array (e.g. as opposed to a - # numpy.float64) - return np.asanyarray(x) - - return x - - -def psum(x, y): - """Add two arrays element-wise. - - If either or both of the arrays are masked then the output array is - masked only where both input arrays are masked. - - :Parameters: - - x: numpy array-like - *Might be updated in place*. - - y: numpy array-like - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - **Examples:** - - >>> c = psum(a, b) - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - x_mask = x.mask - x = x.filled(0) - x += y.filled(0) - x = numpy_ma_array(x, mask=x_mask & y.mask, copy=False) - else: - # Only x is masked - x = x.filled(0) - x += y - elif numpy_ma_isMA(y): - # Only y is masked - x += y.filled(0) - else: - # x and y are both unmasked - x += y - - return x - - -def pmax(x, y): - """The element-wise maximum of two arrays. - - :Parameters: - - x: array-like - May be updated in place and should not be used again. - - y: array-like - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - z = numpy_maximum(x, y) - z = numpy_ma_where(x.mask & ~y.mask, y, z) - x = numpy_ma_where(y.mask & ~x.mask, x, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - else: - # Only x is masked - z = numpy_maximum(x, y) - x = numpy_ma_where(x.mask, y, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - elif numpy_ma_isMA(y): - # Only y is masked - z = numpy_maximum(x, y) - x = numpy_ma_where(y.mask, x, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - else: - # x and y are both unmasked - if not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, - # e.g. a numpy.float64) - x = numpy_asanyarray(x) - - numpy_maximum(x, y, out=x) - - return x - - -def pmin(x, y): - """The element-wise minimum of two arrays. - - :Parameters: - - x: `numpy.ndarray` - May be updated in place and should not be used again. - - y: `numpy.ndarray` - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - z = numpy_minimum(x, y) - z = numpy_ma_where(x.mask & ~y.mask, y, z) - x = numpy_ma_where(y.mask & ~x.mask, x, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - else: - # Only x is masked - z = numpy_minimum(x, y) - x = numpy_ma_where(x.mask, y, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - elif numpy_ma_isMA(y): - # Only y is masked - z = numpy_minimum(x, y) - x = numpy_ma_where(y.mask, x, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - else: - # x and y are both unmasked - if not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, - # e.g. a numpy.float64) - x = numpy_asanyarray(x) - - numpy_minimum(x, y, out=x) - - return x - - -def mask_where_too_few_values(Nmin, N, x): - """Mask elements of N and x where N is strictly less than Nmin. - - :Parameters: - - Nmin: `int` - - N: `numpy.ndarray` - - x: `numpy.ndarray` - - :Returns: - - (`numpy.ndarray`, `numpy.ndarray`) - A tuple containing *N* and *x*, both masked where *N* is - strictly less than *Nmin*. 
- - """ - if N.min() < Nmin: - mask = N < Nmin - N = numpy_ma_array(N, mask=mask, copy=False, shrink=False) - x = numpy_ma_array(x, mask=mask, copy=False, shrink=True) - - return asanyarray(N, x) - - -def mask_small_sample_size(x, N, axis, mtol, original_shape): - """Mask elements of N and x where N is strictly less than Nmin. - - :Parameters: - - Nmin: `int` - - N: `numpy.ndarray` - - x: `numpy.ndarray` - - :Returns: - - (`numpy.ndarray`, `numpy.ndarray`) - A tuple containing *N* and *x*, both masked where *N* is - strictly less than *Nmin*. - - """ - if mtol < 1: - Nmax = reduce(mul, [original_shape[i] for i in axis], 1) - x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) - - return asanyarray(x) - - -def double_precision(a): - """Convert the input array to double precision. - - :Parameters: - - a: `numpy.ndarray` - - :Returns: - - `numpy.ndarray` - - """ - char = a.dtype.char - if char == "f": - newtype = float - elif char == "i": - newtype = int - else: - return a - - if numpy_ma_isMA(a): - return a.astype(newtype) - else: - return a.astype(newtype, copy=False) - - -# -------------------------------------------------------------------- -# mean -# -------------------------------------------------------------------- -def mean_f(a, axis=None, weights=None, masked=False): - """The weighted average along the specified axes. - - :Parameters: - - a: array-like - Input array. Not all missing data - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - :Returns: - - 3-`tuple` of `numpy.ndarray` - The sample size, average and sum of weights inside a 3-tuple. - - """ - a = double_precision(a) - - if masked: - average = numpy_ma_average - else: - average = numpy_average - - avg, sw = average(a, axis=axis, weights=weights, returned=True) - - if not numpy_ndim(avg): - avg = numpy_asanyarray(avg) - sw = numpy_asanyarray(sw) - - if weights is None: - N = sw.copy() - else: - (N,) = sample_size_f(a, axis=axis, masked=masked) - - return asanyarray(N, avg, sw) - - -def mean_fpartial(out, out1=None, group=False): - """Return the partial sample size, the partial sum and partial sum - of the weights. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - Either an output from a previous call to `mean_fpartial`; - or, if *out1* is `None`, an output from `mean_f`. - - out1: 3-`tuple` of `numpy.ndarray`, optional - An output from `mean_f`. - - :Returns: - - 3-`tuple` of `numpy.ndarray` - The sample size, average and sum of weights inside a 3-tuple. - - """ - N, avg, sw = out - - if out1 is None and not group: - # This is the first partition to be processed - - # Convert the partition average to a partition sum - avg *= sw - else: - # Combine this partition with existing partial combination - N1, avg1, sw1 = out1 - - # Convert the partition average to a partition sum - if not group: - avg1 *= sw1 - - N = psum(N, N1) - avg = psum(avg, avg1) # Now a partial sum - sw = psum(sw, sw1) - - return asanyarray(N, avg, sw) - - -def mean_ffinalise(out, sub_samples=None): - """Divide the weighted sum by the sum of weights. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - An output from `mean_fpartial`. - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the mean. 
- - """ - N, avg, sw = out - - if sub_samples: - avg /= sw - - return mask_where_too_few_values(1, N, avg) - - -# -------------------------------------------------------------------- -# mean_absolute_value -# -------------------------------------------------------------------- -def mean_abs_f(a, axis=None, weights=None, masked=False): - """Return the mean of the absolute array, or the means of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input is - used. - - masked: `bool` - - :Returns: - - 2-tuple of `numpy.ndarray` - The sample sizes and the means of the absolute values. - - """ - return mean_f(numpy_abs(a), axis=axis, weights=weights, masked=masked) - - -mean_abs_fpartial = mean_fpartial -mean_abs_ffinalise = mean_ffinalise - - -# -------------------------------------------------------------------- -# root_mean_square -# -------------------------------------------------------------------- -def root_mean_square_f(a, axis=None, weights=None, masked=False): - """The RMS along the specified axes. - - :Parameters: - - a: array-like - Input array. Not all missing data - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - :Returns: - - `tuple` - 3-tuple. - - """ - a = double_precision(a) - - return mean_f(a ** 2, axis=axis, weights=weights, masked=masked) - - -root_mean_square_fpartial = mean_fpartial - - -def root_mean_square_ffinalise(out, sub_samples=None): - """Divide the weighted sum by the sum of weights and take the square - root. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - An output from `root_mean_square_fpartial`. - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the RMS. - - """ - N, avg = mean_ffinalise(out, sub_samples=sub_samples) - - avg **= 0.5 - - return asanyarray(N, avg) - - -def sum_f(a, axis=None, weights=None, masked=False): - """Return the sum of an array or the sum along an axis. - - ``sum_f(a, axis=axis)`` is equivalent to ``(numpy.sum(a, - axis=axis),)`` - - :Parameters: - - a: numpy array-like - Input array - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - a = double_precision(a) - - (N,) = sample_size_f(a, axis=axis, masked=masked) - - if weights is not None: - # A weights array has been provided - weights = double_precision(weights) - - if weights.ndim < a.ndim: - weights = broadcast_array(weights, a.shape) - - a = a * weights - - asum = a.sum(axis=axis) - - if not numpy_ndim(asum): - asum = numpy_asanyarray(asum) - - return asanyarray(N, asum) - - -def sum_fpartial(out, out1=None, group=False): - """Return the partial sum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - group: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. 
- - """ - N, asum = out - - if out1 is not None: - N1, asum1 = out1 - N = psum(N, N1) - asum = psum(asum, asum1) - - return asanyarray(N, asum) - - -def sum_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the sum of an array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - return mask_where_too_few_values(1, *out) - - -# --------------------------------------------------------------------- -# sum_of_squares -# --------------------------------------------------------------------- -def sum_of_squares_f(a, axis=None, weights=None, masked=False): - """Return the sum of the square of an array or the sum of squares - along an axis. - - :Parameters: - - a: numpy array-like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - `tuple` - 2-tuple - - """ - a = double_precision(a) - return sum_f(a ** 2, axis=axis, weights=weights, masked=masked) - - -sum_of_squares_fpartial = sum_fpartial -sum_of_squares_ffinalise = sum_ffinalise - - -# --------------------------------------------------------------------- -# Sum of weights -# --------------------------------------------------------------------- -def sw_f( - a, axis=None, masked=False, weights=None, N=None, sum_of_squares=False -): - """Return the sum of weights for an array. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool`, optional - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - N: `numpy.ndarray` - Sample size - - sum_of_squares: delta degrees of freedom - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum of weights. - - """ - if N is None: - (N,) = sample_size_f(a, axis=axis, masked=masked) - - if weights is not None: - # A weights array has been provided - weights = double_precision(weights) - - if weights.ndim < a.ndim: - weights = broadcast_array(weights, a.shape) - - if masked: - weights = numpy_ma_array(weights, mask=a.mask, copy=False) - - if sum_of_squares: - weights = weights * weights - - sw = weights.sum(axis=axis) - - if not numpy_ndim(sw): - sw = numpy_asanyarray(sw) - else: - # The sum of weights is equal to the sample size (i.e. an - # unweighted sample) - sw = N.copy() - - return asanyarray(N, sw) - - -sw_fpartial = sum_fpartial -sw_ffinalise = sum_ffinalise - -# --------------------------------------------------------------------- -# Sum of squares of weights -# --------------------------------------------------------------------- -sw2_f = functools_partial(sw_f, sum_of_squares=True) -sw2_fpartial = sum_fpartial -sw2_ffinalise = sum_ffinalise - - -# --------------------------------------------------------------------- -# Variance -# --------------------------------------------------------------------- -def var_f(a, axis=None, weights=None, masked=False, ddof=0): - """Return a tuple containing metrics relating to the array variance. 
- - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Sample size - - var Sample variance (ddof=0) - - avg Weighted mean - - V1 Sum of weights - - V2 Sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the sample is weighted - ======== ============================================================ - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - ddof: delta degrees of freedom, optional - - :Returns: - - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. - - """ - # Make sure that a is double precision - a = double_precision(a) - - weighted = weights is not None - - # ---------------------------------------------------------------- - # Methods: - # - # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - # ---------------------------------------------------------------- - - # Calculate N = number of data points - # Calculate avg = mean of data - # Calculate V1 = sum of weights - N, avg, V1 = mean_f(a, weights=weights, axis=axis, masked=masked) - - # Calculate V2 = sum of squares of weights - if weighted and ddof == 1: - N, V2 = sw2_f(a, axis=axis, masked=masked, weights=weights, N=N) - else: - V2 = None - - if axis is not None and avg.size > 1: - # We collapsed over a single axis and the array has 2 or more - # axes, so add an extra size 1 axis to the mean so that - # broadcasting works when we calculate the variance. - reshape_avg = True - if masked: - expand_dims = numpy_ma_expand_dims - else: - expand_dims = numpy_expand_dims - - avg = expand_dims(avg, axis) - else: - reshape_avg = False - - var = a - avg - var *= var - - if masked: - average = numpy_ma_average - else: - average = numpy_average - - var = average(var, axis=axis, weights=weights) - - if reshape_avg: - shape = avg.shape - avg = avg.reshape(shape[:axis] + shape[axis + 1 :]) - - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) - - return (N, var, avg, V1, V2, ddof, weighted) - - -def var_fpartial(out, out1=None, group=False): - """Return a tuple of partial metrics relating to the array variance. 
- - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Partial sample size - - var Partial sum of V1*(variance + mean^2) - - avg Unweighted partial sum - - V1 Partial sum of weights - - V2 Partial sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the population is weighted - ======== ============================================================ - - For further information, see: - https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - - :Parameters: - - out: 7-`tuple` - - out1: 7-`tuple`, optional - - :Returns: - - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. - - """ - (N, var, avg, V1, V2, ddof, weighted) = out - - if out1 is None and not group: - # ------------------------------------------------------------ - # var = V1(var+avg**2) - # avg = V1*avg = unweighted partial sum - # ------------------------------------------------------------ - var += avg * avg - var *= V1 - avg *= V1 - else: - # ------------------------------------------------------------ - # var = var + V1b(varb+avgb**2) - # avg = avg + V1b*avgb - # V1 = V1 + V1b - # V2 = V2 + V2b - # ------------------------------------------------------------ - (Nb, varb, avgb, V1b, V2b, ddof, weighted) = out1 - - N = psum(N, Nb) - - if not group: - varb += avgb * avgb - varb *= V1b - avgb *= V1b - - var = psum(var, varb) - avg = psum(avg, avgb) - V1 = psum(V1, V1b) - - if weighted and ddof == 1: - V2 = psum(V2, V2b) - # --- End: if - - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) - - return (N, var, avg, V1, V2, ddof, weighted) - +from dask.array import chunk +from dask.array.core import _concatenate2 +from dask.array.reductions import divide, numel, reduction +from dask.core import flatten +from dask.utils import deepmap # Apply function inside nested lists -def var_ffinalise(out, sub_samples=None): - """Calculate the variance of the array and return it with the sample - size. - Also mask out any values derived from a too-small sample size. +def mask_small_sample_size(x, N, axis, mtol, original_shape): + """Mask elements where the sample size of the collapsed data is + below a threshold. :Parameters: - out: 7-`tuple` - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the variance. 
- - """ - (N, var, avg, V1, V2, ddof, weighted) = out - - N, var = mask_where_too_few_values(max(2, ddof + 1), N, var) - N, V1 = mask_where_too_few_values(max(2, ddof + 1), N, V1) - if V2 is not None: - N, V2 = mask_where_too_few_values(max(2, ddof + 1), N, V2) - - if sub_samples: - # ---------------------------------------------------------------- - # The global biased variance = {[SUM(pV1(pv+pm**2)]/V1} - m**2 - # - # where pV1 = partial sum of weights - # pv = partial biased variance - # pm = partial mean - # V1 = global sum of weights - # m = global mean - # - # Currently: var = SUM(pV1(pv+pm**2) - # avg = V1*m - # - # https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - # - # For the general case of M non-overlapping data sets, X_{1} - # through X_{M}, and the aggregate data set X=\bigcup_{i}X_{i} - # we have the unweighted mean and variance is: - # - # \mu_{X}={\frac{1}{\sum_{i}{N_{X_{i}}}}}\left(\sum_{i}{N_{X_{i}}\mu_{X_{i}}}\right) - # - # var_{X}={{\frac{1}{\sum_{i}{N_{X_{i}}-ddof}}}\left(\sum_{i}{\left[(N_{X_{i}}-1)\sigma_{X_{i}}^{2}+N_{X_{i}}\mu_{X_{i}}^{2}\right]}-\left[\sum_{i}{N_{X_{i}}}\right]\mu_{X}^{2}\right)} - # - # ---------------------------------------------------------------- - avg /= V1 - avg *= avg - var /= V1 - var -= avg - - # ---------------------------------------------------------------- - # var is now the biased global variance - # ---------------------------------------------------------------- - if not weighted: - if ddof: - # The unweighted variance with N-ddof degrees of freedom is - # [V1/(V1-ddof)]*var. In this case V1 equals the sample size, - # N. ddof=1 provides an unbiased estimator of the variance of - # a hypothetical infinite population. - V1 /= V1 - ddof - var *= V1 - elif ddof == 1: - # Calculate the weighted unbiased variance. The unbiased - # variance weighted with _reliability_ weights is - # [V1**2/(V1**2-V2)]*var. - # - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - V1 **= 2 - var *= V1 - V1 -= V2 - var /= V1 - elif ddof: - raise ValueError( - "Can only calculate a weighted variance with a delta degrees " - "of freedom (ddof) of 0 or 1: Got {}".format(ddof) - ) - - return asanyarray(N, var) - - -# --------------------------------------------------------------------- -# standard_deviation -# --------------------------------------------------------------------- -sd_f = var_f -sd_fpartial = var_fpartial - + x: `numpy.ndarray` + The collapsed data. -def sd_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the standard - deviation. + N: `numpy.ndarray` + The sample sizes of the collapsed values. - :Parameters: + axis: sequence of `int` + The axes being collapsed. - out: `tuple` - A 2-tuple + mtol: number - sub_samples: *optional* - Ignored. + original_shape: `tuple` + The shape of the original, uncollapsed data. :Returns: - 2-`tuple` of `numpy.ndarray` - The sample size and the standard deviation. + `numpy.ndarray` + Array *x* masked where *N* is sufficiently small. Note + that input *x* may be modified in-place with the output. """ - N, sd = var_ffinalise(out, sub_samples) - - sd **= 0.5 - - return asanyarray(N, sd) + if not x.ndim: + # Make sure that we have a numpy array (e.g. 
as opposed to a
+        # numpy.float64)
+        x = np.asanyarray(x)
 
+    if mtol < 1:
+        # Nmax = total number of elements, including any missing values
+        Nmax = reduce(mul, [original_shape[i] for i in axis], 1)
+        x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False)
 
-from dask.array import chunk
-from dask.array.core import _concatenate2
-from dask.array.reductions import divide, numel, reduction
-from dask.core import flatten
-from dask.utils import deepmap  # Apply function inside nested lists
+    return x
 
 
 def sum_of_weights(
-    x, weights=None, dtype="f8", N=None, squared=False, **kwargs
+    x, weights=None, squared=False, dtype="f8", N=None, **kwargs
 ):
     """TODO."""
     if weights is None:
         if N is None:
-            N = cf_sample_size_chunk(x, dtype=dtype, **kwargs)["N"]
+            N = cf_sample_size_chunk(x, **kwargs)["N"]
 
         return N
 
@@ -1017,17 +69,12 @@ def sum_of_weights(
 
 
 def combine_arrays(
-    pairs, key, func, axis, dtype, computing_meta=False, **kwargs
+    pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs
 ):
-    """rename *key*"""
     # Create a nested list of N and recursively concatenate it
-    # along the specified axes
-    if isinstance(key, str):
-        dm_func = lambda pair: pair[key]
-    else:
-        dm_func = key
+    # along the specified axes
+    x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs
 
-    x = deepmap(dm_func, pairs) if not computing_meta else pairs
     if dtype:
         kwargs["dtype"] = dtype
 
@@ -1057,26 +104,60 @@ def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
 
 
 def sum_sample_sizes(pairs, axis, **kwargs):
     """Alias of `combine_arrays` with ``key="N", func=chunk.sum,
-    computing_meta=False, dtype="i8"``."""
-    return combine_arrays(pairs, "N", chunk.sum, axis, "i8", False, **kwargs)
+    dtype="i8", computing_meta=False``."""
+    return combine_arrays(
+        pairs, "N", chunk.sum, axis, dtype="i8", computing_meta=False, **kwargs
+    )
 
 
 # --------------------------------------------------------------------
 # mean
 # --------------------------------------------------------------------
 def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs):
-    """Find the max of an array."""
+    """Return chunk-based values for calculating the global mean.
+
+    :Parameters:
+
+        x: numpy.ndarray
+            Chunks data being reduced along one or more axes.
+
+        weights: numpy array-like, optional
+            Weights to be used in the reduction of *x*, with the same
+            shape as *x*. By default the reduction is unweighted.
+
+        dtype: data_type, optional
+            Data type of global reduction.
+
+        computing_meta: `bool` optional
+            See `dask.array.reductions` for details.
+
+        kwargs: `dict`, optional
+            See `dask.array.reductions` for details.
+
+    :Returns:
+
+        `dict`
+            Dictionary with the keys:
+
+            * N: The sample size.
+
+            * V1: The sum of ``weights`` (set to ``N`` if weights are
+              not present).
+
+            * sum: The weighted sum of ``x``.
+
+    """
     if computing_meta:
         return x
 
-    d = cf_sum_chunk(x, weights=weights, **kwargs)
+    # N, sum
+    d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs)
 
     if weights is None:
-        sw = d["N"]
+        d["V1"] = d["N"]
     else:
-        sw = chunk.sum(weights, **kwargs)
+        d["V1"] = chunk.sum(weights, dtype=dtype, **kwargs)
 
-    d["sw"] = sw
     return d
 
 
@@ -1092,7 +173,7 @@ def cf_mean_combine(
     pairs = [pairs]
 
     d = {}
-    for key in ("sum", "sw"):
+    for key in ("sum", "V1"):
         d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs)
         if computing_meta:
             return d[key]
 
@@ -1116,7 +197,7 @@ def cf_mean_agg(
     if computing_meta:
         return d
 
-    x = divide(d["sum"], d["sw"])  # dtype?
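+    # The global mean is the weighted sum divided by the sum of the
+    # weights, V1 (which equals the sample size N for an unweighted
+    # reduction). For example, chunk partial sums of 6 and 4 with
+    # weight sums of 2 and 3 give a global mean of (6 + 4) / (2 + 3) = 2.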
+ x = divide(d["sum"], d["V1"], dtype=dtype) x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x @@ -1143,10 +224,106 @@ def cf_mean( # -------------------------------------------------------------------- -# max +# mean_absolute_value +# -------------------------------------------------------------------- +def cf_mean_abs_chunk( + x, weights=None, dtype=None, computing_meta=False, **kwargs +): + """Return chunk-based values for calculating the global absolute + mean. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * V1: The sum of the weights (set to N if weights are not + present). + + * sum: The weighted sum of ``abs(x)``. + + """ + if computing_meta: + return x + + return cf_mean_chunk(np.abs(x), weights, dtype=dtype, **kwargs) + + +def cf_mean_abs( + a, weights=None, axis=None, keepdims=False, mtol=None, split_every=None +): + """TODODASK.""" + dtype = a.dtype + return reduction( + a, + cf_mean_abs_chunk, + partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + +# -------------------------------------------------------------------- +# maximum # -------------------------------------------------------------------- -def cf_max_chunk(x, computing_meta=False, **kwargs): - """Find the max of an array.""" +def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Return chunk-based values for calculating the global maximum. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. Ignored. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * max: The maximum of `x``. + + """ if computing_meta: return x @@ -1217,25 +394,36 @@ def cf_max(a, axis=None, keepdims=False, mtol=None, split_every=None): # -------------------------------------------------------------------- # maximum_absolute_value # -------------------------------------------------------------------- -def cf_max_abs_chunk(x, computing_meta=False, **kwargs): - """Return the maximum of the absolute array, or the maximum of the - absolute array along an axis. +def cf_max_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Return chunk-based values for calculating the global absolute + max. :Parameters: - a: numpy array_like - Input array + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. 
+ + dtype: data_type, optional + Data type of global reduction. - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. + computing_meta: `bool` optional + See `dask.array.reductions` for details. - masked: bool + kwargs: `dict`, optional + See `dask.array.reductions` for details. :Returns: - 2-tuple of numpy arrays - The sample sizes and the maxima of the absolute values. + `dict` + Dictionary with the keys: + + * N: The sample size. + + * max: The maximum of ``abs(x)``. """ if computing_meta: @@ -1244,15 +432,58 @@ def cf_max_abs_chunk(x, computing_meta=False, **kwargs): return cf_max_chunk(np.abs(x), **kwargs) -cf_max_abs_combine = cf_max_combine -cf_max_abs_agg = cf_max_agg +def cf_max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): + """TODODASK.""" + dtype = a.dtype + return reduction( + a, + cf_max_abs_chunk, + partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_max_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) # -------------------------------------------------------------------- -# min +# minimum # -------------------------------------------------------------------- -def cf_min_chunk(x, computing_meta=False, **kwargs): - """Find the max of an array.""" +def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Return chunk-based values for calculating the global minimum. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * min: The minimum of ``x``. + + """ if computing_meta: return x @@ -1323,25 +554,36 @@ def cf_min(a, axis=None, keepdims=False, mtol=None, split_every=None): # -------------------------------------------------------------------- # minimum absolute value # -------------------------------------------------------------------- -def cf_min_abs_chunk(x, computing_meta=False, **kwargs): - """Return the maximum of the absolute array, or the maximum of the - absolute array along an axis. +def cf_min_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Return chunk-based values for calculating the global absolute + min. :Parameters: - a: numpy array_like - Input array + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. + computing_meta: `bool` optional + See `dask.array.reductions` for details. - masked: bool + kwargs: `dict`, optional + See `dask.array.reductions` for details. :Returns: - 2-tuple of numpy arrays - The sample sizes and the maxima of the absolute values. + `dict` + Dictionary with the keys: + + * N: The sample size. + + * min: The minimum of ``abs(x)``. 
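+
+              For example, ``x = [-4, 2]`` gives an ``N`` of ``2``
+              and a ``min`` of ``2``.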
""" if computing_meta: @@ -1350,15 +592,60 @@ def cf_min_abs_chunk(x, computing_meta=False, **kwargs): return cf_min_chunk(np.abs(x), **kwargs) -cf_min_abs_combine = cf_min_combine -cf_min_abs_agg = cf_min_agg +def cf_min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): + """TODODASK.""" + dtype = a.dtype + return reduction( + a, + cf_min_abs_chunk, + partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_min_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Find the max and min of an array.""" + """Return chunk-based values for calculating the global range. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * min: The minimum of ``x``. + + * max: The maximum of ``x`. + + """ if computing_meta: return x @@ -1430,14 +717,96 @@ def cf_range(a, axis=None, keepdims=False, mtol=None, split_every=None): ) -# -------------------------------------------------------------------- -# mid-range -# -------------------------------------------------------------------- -cf_mid_range_chunk = cf_range_chunk -cf_mid_range_combine = cf_range_combine +# -------------------------------------------------------------------- +# mid-range +# -------------------------------------------------------------------- +cf_mid_range_chunk = cf_range_chunk +cf_mid_range_combine = cf_range_combine + + +def cf_mid_range_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Find the mid-range of a nested list of arrays.""" + d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the mid-range + x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +def cf_mid_range( + a, axis=None, dtype=float, keepdims=False, mtol=None, split_every=None +): + """TODODASK.""" + dtype = float + return reduction( + a, + cf_mid_range_chunk, + partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mid_range_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + +# -------------------------------------------------------------------- +# root mean square +# -------------------------------------------------------------------- +def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Return chunk-based values for calculating the global RMS. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. 
By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + kwargs: `dict`, optional + See `dask.array.reductions` for details. -def cf_mid_range_agg( + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * sum: The weighted sum of ``x**2``. + + """ + if computing_meta: + return x + + return cf_mean_chunk( + np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs + ) + + +def cf_rms_agg( pairs, axis=None, dtype="f8", @@ -1446,34 +815,35 @@ def cf_mid_range_agg( original_shape=None, **kwargs, ): - """Find the mid-range of a nested list of arrays.""" - d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + """Apply the function to the data in a nested list of arrays and + mask where the sample size is below the threshold.""" + d = cf_sum_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d - # Calculate the mid-range - x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = np.sqrt(d["sum"], dtype=dtype) x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_mid_range( - a, axis=None, dtype=float, keepdims=False, mtol=None, split_every=None +def cf_rms( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None ): """TODODASK.""" dtype = float return reduction( a, - cf_mid_range_chunk, - partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), + cf_rms_chunk, + partial(cf_rms_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, dtype=dtype, split_every=split_every, - combine=cf_mid_range_combine, + combine=cf_sum_combine, out=None, concatenate=False, meta=np.array((), dtype=dtype), + weights=weights, ) @@ -1481,6 +851,34 @@ def cf_mid_range( # sample_size # -------------------------------------------------------------------- def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): + """Return chunk-based values for calculating the global sample size. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + """ if computing_meta: return x @@ -1550,7 +948,36 @@ def cf_sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): # sum # -------------------------------------------------------------------- def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Find the max of an array.""" + """Return chunk-based values for calculating the global sum. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. 
+ + * sum: The weighted sum of ``x``. + + """ if computing_meta: return x @@ -1573,7 +1000,6 @@ def cf_sum_combine( if not isinstance(pairs, list): pairs = [pairs] - # Create a nested list of maxima and recursively concatenate it # along the specified axes x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) @@ -1628,74 +1054,233 @@ def cf_sum( # -------------------------------------------------------------------- -# variance +# sum of sqaures # -------------------------------------------------------------------- -def cf_var_chunk( - x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs +def cf_sum_of_squares_chunk( + x, weights=None, dtype="f8", computing_meta=False, **kwargs ): - """Return a tuple containing metrics relating to the array variance. + """Return chunk-based values for calculating the global sum. + + :Parameters: + + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. - The tuple is a 7-tuple that contains, in the order given, the - following variables: + computing_meta: `bool` optional + See `dask.array.reductions` for details. - ======== ============================================================ - Variable Description - ======== ============================================================ - N Sample size + kwargs: `dict`, optional + See `dask.array.reductions` for details. - var Sample variance (ddof=0) + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * sum: The weighted sum of ``x**2`` + + """ + if computing_meta: + return x - avg Weighted mean + return cf_sum_chunk( + np.multiply(x, x, dtype=dtype), weights, dtype=dtype, **kwargs + ) - V1 Sum of weights - V2 Sum of squares of weights +def cf_sum_of_squares( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None +): + """TODODASK.""" + dtype = float + return reduction( + a, + cf_sum_of_squares_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) - ddof Delta degrees of freedom - weighted Whether or not the sample is weighted - ======== ============================================================ +# -------------------------------------------------------------------- +# sum of weights +# -------------------------------------------------------------------- +def cf_sum_of_weights_chunk( + x, weights=None, dtype="f8", computing_meta=False, squared=False, **kwargs +): + """Return chunk-based values for calculating the global sum. :Parameters: - a: numpy array_like - Input array + x: numpy.ndarray + Chunks data being reduced along one or more axes. + + weights: numpy array-like, optional + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. + + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + * sum: The sum of ``weights``, or the sum of + ``weights**2`` if *squared* is True. 
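+
+              For example, ``weights=[1, 2, 3]`` with no missing
+              data gives a ``sum`` of ``6``, or ``14`` when
+              *squared* is True (``1 + 4 + 9``).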
+ + """ + if computing_meta: + return x + + # N + d = cf_sample_size_chunk(x, **kwargs) + + # sum + d["sum"] = sum_of_weights( + x, weights=weights, dtype=dtype, N=d["N"], squared=squared, **kwargs + ) + return d + + +def cf_sum_of_weights( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None +): + """TODODASK.""" + dtype = float + return reduction( + a, + cf_sum_of_weights_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + +def cf_sum_of_weights2( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None +): + """TODODASK.""" + dtype = float + return reduction( + a, + partial(cf_sum_of_weights_chunk, squared=True), + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + out=None, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + +# -------------------------------------------------------------------- +# variance +# -------------------------------------------------------------------- +def cf_var_chunk( + x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs +): + """Return chunk-based values for calculating the global variance. + + .. note:: If weights are provided then they are interpreted as + reliability weights, as opposed to frequency weights + (where a weight equals the number of occurrences). + + :Parameters: - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. + x: numpy.ndarray + Chunks data being reduced along one or more axes. weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. + Weights to be used in the reduction of *x*, with the same + shape as *x*. By default the reduction is unweighted. + + dtype: data_type, optional + Data type of global reduction. - masked: `bool`, optional + ddof: number, optional + The delta degrees of freedom. The number of degrees of + freedom used in the calculation is (N-*ddof*) where N + represents the number of non-missing elements. By default + *ddof* is 0, for the biased variance. Setting ddof to + ``1`` applies Bessel's correction + (https://en.wikipedia.org/wiki/Bessel's_correction) - ddof: delta degrees of freedom, optional + computing_meta: `bool` optional + See `dask.array.reductions` for details. + + kwargs: `dict`, optional + See `dask.array.reductions` for details. :Returns: - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. + `dict` + Dictionary with the keys: + + * N: The sample size. + + * V1: The sum of ``weights`` (set to ``N`` if weights are + not set). + + * sum: The weighted sum of ``x``. + + * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is + the weighted biased (i.e. ``ddof=0``) variance of + ``x``, and ``mu`` is the weighted mean of + ``x``. See + https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics + for details. + + * V2: The sum of ``weights**2``. Only present if *weights* + are set and ``ddof=1``. 
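+
+            Note that ``part`` is algebraically the weighted sum of
+            ``x**2``, so summing ``part``, ``sum`` and ``V1`` over
+            all chunks gives the global biased variance as
+            ``part/V1 - (sum/V1)**2``.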
""" - # ---------------------------------------------------------------- - # Methods: - # - # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - # ---------------------------------------------------------------- if computing_meta: return x + # N, V1, sum d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) wsum = d["sum"] - V1 = d["sw"] - N = d["N"] - - # with np.errstate(divide="ignore", invalid="ignore"):?????? + V1 = d["V1"] + + # part avg = divide(wsum, V1, dtype=dtype) part = x - avg part *= part @@ -1704,13 +1289,7 @@ def cf_var_chunk( part = chunk.sum(part, dtype=dtype, **kwargs) part = part + avg * wsum - - d = { - "part": part, - "wsum": wsum, - "N": N, - "V1": V1, - } + d["part"] = part if weights is not None and ddof == 1: d["V2"] = sum_of_weights(x, weights, squared=True, **kwargs) @@ -1725,38 +1304,12 @@ def cf_var_combine( computing_meta=False, **kwargs, ): - """Return a tuple of partial metrics relating to the array variance. - - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Partial sample size - - var Partial sum of V1*(variance + mean^2) - - avg Unweighted partial sum - - V1 Partial sum of weights - - V2 Partial sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the population is weighted - ======== ============================================================ - - For further information, see: - https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - - """ + """TODO.""" d = {} weighted = "V2" in flatten(pairs) - keys = ("part", "wsum") + keys = ("part", "sum") if weighted: keys += ("V1", "V2") @@ -1783,23 +1336,7 @@ def cf_var_agg( ddof=None, **kwargs, ): - """Calculate the variance of the array and return it with the sample - size. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 7-`tuple` - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the variance. - - """ + """TODO.""" d = cf_var_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d @@ -1808,33 +1345,47 @@ def cf_var_agg( V2 = d.get("V2") weighted = V2 is not None + wsum = d["sum"] + var = d["part"] - wsum * wsum / V1 + + # Note: var is currently the global value of V1 * sigma**2, where + # sigma is the global weighted biased (i.e. ddof=0) variance. + if ddof == 0: # intended equality with zero # Weighted or unweighted variance with ddof=0 f = 1 / V1 elif not weighted: - # Unweighted variance with any non-zero value of ddof + # Unweighted variance with any non-zero value of ddof. f = 1 / (V1 - ddof) elif ddof == 1: - # Weighted variance with ddof=1 - # - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance + # Weighted variance with ddof=1. For details see # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights f = V1 / (V1 * V1 - V2) else: raise ValueError( - "Can only calculate a weighted variance with a ddof=0 or " - f"ddof=1: Got {ddof!r}" + "Can only calculate a weighted variance with ddof=0 or ddof=1: " + f"Got {ddof!r}" ) - wsum = d["wsum"] - var = f * (d["part"] - wsum * wsum / V1) + # Calculate the global variance, with the specified weighting and + # ddof. 
+ var = f * var + + # Note: var is now the global value of sigma**2 var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) return var -def cf_var( a, axis=None, weights=None, keepdims=False, mtol=None, - ddof=None, split_every=None ): +def cf_var( + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + ddof=None, + split_every=None, +): """TODODASK.""" dtype = float return reduction( @@ -1851,63 +1402,3 @@ def cf_var( a, axis=None, weights=None, keepdims=False, mtol=None, meta=np.array((), dtype=dtype), weights=weights, ) - - -def cf_sd_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - ddof=None, - **kwargs, -): - """Calculate the variance of the array and return it with the sample - size. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 7-`tuple` - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the variance. - - """ - d = cf_var_combine(pairs, axis, computing_meta, **kwargs) - if computing_meta: - return d - - V2 = d.get("V2") - weighted = V2 is not None - - V1 = d["V1"] - if ddof == 0: # intended equality with zero - # Weighted or unweighted variance with ddof=0 - f = 1 / V1 - elif not weighted: - # Unweighted variance with any non-zero value of ddof - f = 1 / (V1 - ddof) - elif ddof == 1: - # Weighted variance with ddof=1 - # - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights - f = V1 / (V1 * V1 - V2) - else: - raise ValueError( - "Can only calculate a weighted variance with a ddof=0 or " - f"ddof=1: Got {ddof!r}" - ) - - wsum = d["sum"] - var = f * (d["part"] - (wsum * wsum) / V1) - - var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) - return var diff --git a/cf/data/data.py b/cf/data/data.py index e8eb97a1cc..1c3fbbaf6f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -14,7 +14,6 @@ import numpy as np from dask.array import Array from dask.array.core import normalize_chunks -from dask.array.routines import result_type from dask.base import is_dask_collection, tokenize from dask.core import flatten from dask.highlevelgraph import HighLevelGraph @@ -36,7 +35,6 @@ abspath, ) from ..functions import atol as cf_atol -from ..functions import broadcast_array from ..functions import chunksize as cf_chunksize from ..functions import default_netCDF_fillvals from ..functions import fm_threshold as cf_fm_threshold @@ -50,32 +48,6 @@ NetCDFArray, UMArray, ) -from .collapse_functions import ( - mean_abs_f, - mean_abs_ffinalise, - mean_abs_fpartial, - mean_f, - mean_ffinalise, - mean_fpartial, - root_mean_square_f, - root_mean_square_ffinalise, - root_mean_square_fpartial, - sd_f, - sd_ffinalise, - sd_fpartial, - sum_of_squares_f, - sum_of_squares_ffinalise, - sum_of_squares_fpartial, - sw2_f, - sw2_ffinalise, - sw2_fpartial, - sw_f, - sw_ffinalise, - sw_fpartial, - var_f, - var_ffinalise, - var_fpartial, -) from .creation import ( compressed_to_dask, convert_to_builtin_type, @@ -4950,773 +4922,21 @@ def __neg__(self): """ return self._unary_operation("__neg__") - def __invert__(self): - """The unary bitwise operation ``~`` - - x.__invert__() <==> ~x - - """ - return self._unary_operation("__invert__") - - def __pos__(self): - """The unary arithmetic operation ``+`` - - x.__pos__() <==> +x - - """ - return self._unary_operation("__pos__") - - @_deprecated_kwarg_check("i") - 
@_inplace_enabled(default=False) - def _collapse( - self, - func, - fpartial, - ffinalise, - axes=None, - squeeze=False, - weights=None, - mtol=1, - units=None, - inplace=False, - i=False, - _preserve_partitions=False, - **kwargs, - ): - """Collapse the data. - - :Parameters: - - func: function - - fpartial: function - - ffinalise: function - - axes: (sequence of) `int`, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. - - squeeze: `bool`, optional - If False then the axes which are collapsed are left in the - result as axes with size 1. In this case the result will - broadcast correctly against the original array. By default - collapsed axes are removed. - - weights: *optional* - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - _preserve_partitions: `bool`, optional - If True then preserve the shape of the partition matrix of - the input data, at the expense of a much slower - execution. By default, the partition matrix may be reduced - (using `varray`) to considerably speed things up. - - kwargs: *optional* - - :Returns: - - `Data` or `None` - - """ - d = _inplace_enabled_define_and_cleanup(self) - ndim = d._ndim - self_axes = d._axes - self_shape = d._shape - - original_self_axes = self_axes[:] - - if axes is None: - # Collapse all axes - axes = list(range(ndim)) - n_collapse_axes = ndim - n_non_collapse_axes = 0 - Nmax = d._size - elif not axes and axes != 0: - # Collapse no axes - if inplace: - d = None - return d - else: - # Collapse some (maybe all) axes - axes = sorted(d._parse_axes(axes)) # , '_collapse')) - n_collapse_axes = len(axes) - n_non_collapse_axes = ndim - n_collapse_axes - Nmax = 1 - for i in axes: - Nmax *= self_shape[i] - # --- End: if - - # ------------------------------------------------------------- - # Parse the weights. - # - # * Change the keys from dimension names to the integer - # positions of the dimensions. - # - # * Make sure all non-null weights are Data objects. 
- # ------------------------------------------------------------ - if weights is not None: - if not isinstance(weights, dict): - # If the shape of the weights is not the same as the - # shape of the data array then the weights are assumed - # to span the collapse axes in the order in which they - # are given - if np.shape(weights) == self_shape: - weights = {tuple(self_axes): weights} - else: - weights = {tuple([self_axes[i] for i in axes]): weights} - - else: - weights = weights.copy() - weights_axes = set() - for key, value in tuple(weights.items()): - del weights[key] - key = d._parse_axes(key) - if weights_axes.intersection(key): - raise ValueError("Duplicate weights axis") - - weights_axes.update(key) - weights[tuple([self_axes[i] for i in key])] = value - # --- End: for - - if not weights_axes.intersection(axes): - # Ignore all of the weights if none of them span - # any collapse axes - weights = {} - # --- End: if - - for key, weight in tuple(weights.items()): - if weight is None or np.size(weight) == 1: - # Ignore undefined weights and size 1 weights - del weights[key] - continue - - weight_ndim = np.ndim(weight) - if weight_ndim != len(key): - raise ValueError( - "Can't collapse: Incorrect number of weights " - "axes (%d != %d)" % (weight.ndim, len(key)) - ) - - if weight_ndim > ndim: - raise ValueError( - "Can't collapse: Incorrect number of weights " - "axes (%d > %d)" % (weight.ndim, ndim) - ) - - for n, axis in zip(np.shape(weight), key): - if n != self_shape[self_axes.index(axis)]: - raise ValueError( - "Can't collapse: Incorrect weights " - "shape {!r}".format(np.shape(weight)) - ) - # --- End: for - - # Convert weight to a data object, if necessary. - weight = type(self).asdata(weight) - - if weight.dtype.char in ("S", "U"): - # Ignore string-valued weights - del weights[key] - continue - - weights[key] = weight - # --- End: for - # --- End: if - - if axes != list(range(n_non_collapse_axes, ndim)): - transpose_iaxes = [i for i in range(ndim) if i not in axes] + axes - d.transpose(transpose_iaxes, inplace=True) - - if weights: - # Optimize when weights span only non-partitioned axes - # (do this before permuting the order of the weight - # axes to be consistent with the order of the data - # axes) - weights = d._collapse_optimize_weights(weights) - - # Permute the order of the weight axes to be - # consistent with the order of the data axes - self_axes = d._axes - for key, w in tuple(weights.items()): - key1 = tuple([axis for axis in self_axes if axis in key]) - - if key1 != key: - w = w.transpose([key.index(axis) for axis in key1]) - - del weights[key] - ikey = tuple([self_axes.index(axis) for axis in key1]) - - weights[ikey] = w - - # Add the weights to kwargs - kwargs["weights"] = weights - - # If the input data array 'fits' in one chunk of memory, then - # make sure that it has only one partition - if ( - not _preserve_partitions - and d._pmndim - and d.fits_in_one_chunk_in_memory(d.dtype.itemsize) - ): - d.varray - - # ------------------------------------------------------------- - # Initialise the output data array - # ------------------------------------------------------------- - new = d[(Ellipsis,) + (0,) * n_collapse_axes] - - # new._auxiliary_mask = None - for partition in new.partitions.matrix.flat: - # Do this so as not to upset the ref count on the - # parittion's of d - del partition.subarray - - # d.to_memory() - - # save = not new.fits_in_memory(new.dtype.itemsize) - keep_in_memory = new.fits_in_memory(new.dtype.itemsize) - - datatype = d.dtype - - if units 
is None: - new_units = new.Units - else: - new_units = units - - p_axes = new._axes[:n_non_collapse_axes] - p_units = new_units - - c_slice = (slice(None),) * n_collapse_axes - - config = new.partition_configuration( - readonly=False, auxiliary_mask=None, extra_memory=False # DCH ??x - ) - - processed_partitions = [] - for pmindex, partition in np.ndenumerate(new.partitions.matrix): - if partition._process_partition: - # Only process the partition if it is flagged - partition.open(config) - - # Save the position of the partition in the partition - # matrix - partition._pmindex = pmindex - - partition.axes = p_axes - partition.flip = [] - partition.part = [] - partition.Units = p_units - - if squeeze: - # Note: parentheses for line continuation (not a tuple): - partition.location = partition.location[ - :n_non_collapse_axes - ] - partition.shape = partition.shape[:n_non_collapse_axes] - - indices = partition.indices[:n_non_collapse_axes] + c_slice - - partition.subarray = d._collapse_subspace( - func, - fpartial, - ffinalise, - indices, - n_non_collapse_axes, - n_collapse_axes, - Nmax, - mtol, - _preserve_partitions=_preserve_partitions, - _parallelise_collapse_subspace=False, - **kwargs, - ) - - partition.close(keep_in_memory=keep_in_memory) - - # Add each partition to a list of processed partitions - processed_partitions.append(partition) - # --- End: if - # --- End: for - - # processed_partitions contains a list of all the partitions - # that have been processed on this rank. In the serial case - # this is all of them and this line of code has no - # effect. Otherwise the processed partitions from each rank - # are distributed to every rank and processed_partitions now - # contains all the processed partitions from every rank. - processed_partitions = self._share_partitions( - processed_partitions, False - ) - - # Put the processed partitions back in the partition matrix - # according to each partitions _pmindex attribute set above. - pm = new.partitions.matrix - for partition in processed_partitions: - pm[partition._pmindex] = partition - - p_datatype = partition.subarray.dtype - if datatype != p_datatype: - datatype = np.result_type(p_datatype, datatype) - # --- End: for - - new._Units = new_units - new.dtype = datatype - - if squeeze: - new._axes = p_axes - new._ndim = ndim - n_collapse_axes - new._shape = new._shape[: new._ndim] - else: - new_axes = new._axes - if new_axes != original_self_axes: - iaxes = [new_axes.index(axis) for axis in original_self_axes] - new.transpose(iaxes, inplace=True) - # --- End: if - - # ------------------------------------------------------------ - # Update d in place and return - # ------------------------------------------------------------ - d.__dict__ = new.__dict__ - - return d - - def _collapse_subspace( - self, - func, - fpartial, - ffinalise, - indices, - n_non_collapse_axes, - n_collapse_axes, - Nmax, - mtol, - weights=None, - _preserve_partitions=False, - _parallelise_collapse_subspace=True, - **kwargs, - ): - """Collapse a subspace of a data array. - - If set, *weights* and *kwargs* are passed to the function call. If - there is a *weights* keyword argument then this should either evaluate - to False or be a dictionary of weights for at least one of the data - dimensions. - - :Parameters: - - func : function - - fpartial : function - - ffinalise : function - - indices: tuple - The indices of the master array which would create the - subspace. - - n_non_collapse_axes : int - The number of data array axes which are not being - collapsed. 
It is assumed that they are in the slowest moving - positions. - - n_collapse_axes : int - The number of data array axes which are being collapsed. It is - assumed that they are in the fastest moving positions. - - weights : dict, optional - - kwargs : *optional* - - :Returns: - - `list` - - **Examples:** - - """ - - ndim = self._ndim - - master_shape = self.shape - - data = self[indices] - - # If the input data array 'fits' in one chunk of memory, then - # make sure that it has only one partition - if ( - not _preserve_partitions - and data._pmndim - and data.fits_in_memory(data.dtype.itemsize) - ): - data.varray - - # True iff at least two, but not all, axes are to be - # collapsed. - reshape = 1 < n_collapse_axes < ndim - - out = None - - if n_collapse_axes == ndim: - # All axes are to be collapsed - kwargs.pop("axis", None) - else: - # At least one axis, but not all axes, are to be - # collapsed. It is assumed that the collapse axes are in - # the last (fastest varying) positions (-1, -2, ...). We - # set kwargs['axis']=-1 (actually we use the +ve integer - # equivalent of -1) if there is more then one collapse - # axis because, in this case (i.e. reshape is True), we - # will reshape everything. - kwargs["axis"] = ndim - n_collapse_axes - - masked = False - - sub_samples = 0 - - # pda_args = data.pda_args(revert_to_file=True) #, readonly=True) - config = data.partition_configuration(readonly=True) - - # Flag which partitions will be processed on this rank. If - # _parallelise_collapse_subspace is False then all partitions - # will be flagged for processing. - data._flag_partitions_for_processing(_parallelise_collapse_subspace) - - for i, partition in enumerate(data.partitions.matrix.flat): - if partition._process_partition: - # Only process a partition if flagged - partition.open(config) - array = partition.array - - p_masked = partition.masked - - if p_masked: - masked = True - if array.mask.all(): - # The array is all missing data - partition.close() - continue - - # Still here? Then there are some non-missing sub-array - # elements. - if weights is not None: - w = self._collapse_create_weights( - array, - partition.indices, - indices, - master_shape, - weights, - n_non_collapse_axes, - n_collapse_axes, - ) - wmin = w.min() - if wmin < 0: - raise ValueError( - "Can't collapse with negative weights" - ) - - if wmin == 0: - # Mask the array where the weights are zero - array = np.ma.masked_where(w == 0, array, copy=True) - if array.mask.all(): - # The array is all missing data - partition.close() - continue - # --- End: if - - kwargs["weights"] = w - # --- End: if - - partition.close() - - if reshape: - # At least two, but not all, axes are to be collapsed - # => we need to reshape the array and the weights. 
- shape = array.shape - ndim = array.ndim - new_shape = shape[:n_non_collapse_axes] - new_shape += (reduce(mul, shape[n_non_collapse_axes:]),) - array = np.reshape(array.copy(), new_shape) - - if weights is not None: - w = kwargs["weights"] - if w.ndim < ndim: - # The weights span only collapse axes (as - # opposed to spanning all axes) - new_shape = (w.size,) - - kwargs["weights"] = np.reshape(w, new_shape) - # --- End: if - - p_out = func(array, masked=p_masked, **kwargs) - - if out is None: - if ( - not _parallelise_collapse_subspace - and data.partitions.size == i + 1 - ): - # There is exactly one partition so we are done - out = p_out - break - # --- End: if - out = fpartial(p_out) - else: - out = fpartial(out, p_out) - # --- End: if - - sub_samples += 1 - - # --- End: if - # --- End: for - - # In the case that the inner loop is not parallelised, - # just finalise. - out = self._collapse_finalise( - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ) - # # --- End: if - - return out - - @classmethod - def _collapse_finalise( - cls, - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ): - """Finalise a collapse over a data array.""" - if out is not None: - # Finalise - N, out = ffinalise(out, sub_samples) - out = cls._collapse_mask(out, masked, N, Nmax, mtol) - else: - # no data - return all masked - out = np.ma.masked_all( - data.shape[:n_non_collapse_axes], data.dtype - ) - - return out - - @staticmethod - def _collapse_mask(array, masked, N, Nmax, mtol): - """Re-masks a masked array to reflect a collapse. - - :Parameters: - - array: numpy array - - masked: bool - - N: numpy array-like - - Nmax: int - - mtol: numpy array-like - - :Returns: - - numpy array - - """ - if masked and mtol < 1: - x = N < (1 - mtol) * Nmax - if x.any(): - array = np.ma.masked_where(x, array, copy=False) - # --- End: if - - return array - - @staticmethod - def _collapse_create_weights( - array, - indices, - master_indices, - master_shape, - master_weights, - n_non_collapse_axes, - n_collapse_axes, - ): - """Collapse weights of an array. - - :Parameters: - - array : numpy array - - indices : tuple - - master_indices : tuple - - master_shape : tuple - - master_weights : dict - - n_non_collapse_axes : int - The number of array axes which are not being collapsed. It - is assumed that they are in the slowest moving positions. - - n_collapse_axes : int - The number of array axes which are being collapsed. It is - assumed that they are in the fastest moving positions. 
- - :Returns: - - `numpy array or `None` - - **Examples:** - - """ - array_shape = array.shape - array_ndim = array.ndim - - weights_indices = [] - for master_index, index, size in zip( - master_indices, indices, master_shape - ): - start, stop, step = master_index.indices(size) - - size1, mod = divmod(stop - start - 1, step) - - start1, stop1, step1 = index.indices(size1 + 1) - - size2, mod = divmod(stop1 - start1, step1) - - if mod != 0: - size2 += 1 - - start += start1 * step - step *= step1 - stop = start + (size2 - 1) * step + 1 - - weights_indices.append(slice(start, stop, step)) - - base_shape = (1,) * array_ndim - - masked = False - zero_weights = False - - weights = [] - for key, weight in master_weights.items(): - shape = list(base_shape) - index = [] - for i in key: - shape[i] = array_shape[i] - index.append(weights_indices[i]) - - weight = weight[tuple(index)].array - - zero_weights = zero_weights or (weight.min() <= 0) - - masked = masked or np.ma.isMA(weight) - - if weight.ndim != array_ndim: - # Make sure that the weight has the same number of - # dimensions as the array - weight = weight.reshape(shape) - - weights.append(weight) - - weights_out = weights[0] - - if len(weights) > 1: - # There are two or more weights, so create their product - # (can't do this in-place because of broadcasting woe) - for w in weights[1:]: - weights_out = weights_out * w - - weights_out_shape = weights_out.shape - - if ( - not masked - and weights_out_shape[:n_non_collapse_axes] - == base_shape[:n_non_collapse_axes] - ): - # The input weights are not masked and only span collapse axes - weights_out = weights_out.reshape( - weights_out_shape[n_non_collapse_axes:] - ) - - if ( - weights_out_shape[n_non_collapse_axes:] - != array_shape[n_non_collapse_axes:] - ): - # The input weights span some, but not all, of the - # collapse axes, so broadcast the weights over all - # collapse axes - weights_out = broadcast_array( - weights_out, array_shape[n_non_collapse_axes:] - ) - else: - if weights_out_shape != array_shape: - # Either a) The input weights span at least one - # non-collapse axis, so broadcast the weights over all - # axes or b) The weights contain masked values - weights_out = broadcast_array(weights_out, array_shape) - - if masked and np.ma.isMA(array): - if not (array.mask | weights_out.mask == array.mask).all(): - raise ValueError( - "The output weights mask {} is not compatible with " - "the array mask {}.".format( - weights_out.mask, array.mask - ) - ) - # --- End: if - - return weights_out - - def _collapse_optimize_weights(self, weights): - """Optimise when weights span only non-partitioned axes. 
- - weights: `dict` - - """ - non_partitioned_axes = set(self._axes).difference(self._pmaxes) - - x = [] - new_key = () - for key in weights: - if non_partitioned_axes.issuperset(key): - x.append(key) - new_key += key - # --- End: for + def __invert__(self): + """The unary bitwise operation ``~`` - if len(x) > 1: - reshaped_weights = [] - for key in x: - w = weights.pop(key) - w = w.array - shape = [ - (w.shape[key.index(axis)] if axis in key else 1) - for axis in new_key - ] - w = w.reshape(shape) + x.__invert__() <==> ~x - reshaped_weights.append(w) + """ + return self._unary_operation("__invert__") - # Create their product - new_weight = reshaped_weights[0] - for w in reshaped_weights[1:]: - new_weight = new_weight * w + def __pos__(self): + """The unary arithmetic operation ``+`` - weights[new_key] = type(self)(new_weight) + x.__pos__() <==> +x - return weights + """ + return self._unary_operation("__pos__") # ---------------------------------------------------------------- # Private attributes @@ -7705,6 +6925,7 @@ def maximum( axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, _preserve_partitions=False, @@ -7731,23 +6952,20 @@ def maximum( **Examples:** """ - # TODODASK: Placeholder for the real thing, that takes into - # account axes=axes, squeeze=squeeze, mtol=mtol, - # inplace=inplace. - # - # This is only here for now, in this form, to ensure that - # cf.read works - return self._get_dask().max() - - # return self._collapse(max_f, max_fpartial, max_ffinalise, axes=axes, - # squeeze=squeeze, mtol=mtol, inplace=inplace, - # _preserve_partitions=_preserve_partitions) + return self.max( + axes=axes, + squeeze=squeeze, + mtol=mtol, + split_every=split_every, + inplace=inplace, + ) def maximum_absolute_value( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, _preserve_partitions=False, ): @@ -7785,16 +7003,21 @@ def maximum_absolute_value( """ - return self._collapse( - max_abs_f, - max_abs_fpartial, - max_abs_ffinalise, - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_max_abs as collapse + + d = _inplace_enabled_define_and_cleanup(self) + + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d @_deprecated_kwarg_check("i") def minimum( @@ -7830,15 +7053,12 @@ def minimum( **Examples:** """ - return self._collapse( - min_f, - min_fpartial, - min_ffinalise, + return self.min( axes=axes, squeeze=squeeze, mtol=mtol, + split_every=split_every, inplace=inplace, - _preserve_partitions=_preserve_partitions, ) def minimum_absolute_value( @@ -7883,16 +7103,21 @@ def minimum_absolute_value( """ - return self._collapse( - min_abs_f, - min_abs_fpartial, - min_abs_ffinalise, - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_min_abs as collapse + + d = _inplace_enabled_define_and_cleanup(self) + + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -7907,30 +7132,23 @@ def mean( split_every=None, i=False, ): - from .collapse_functions import cf_mean as func + from .collapse_functions import cf_mean as collapse d = _inplace_enabled_define_and_cleanup(self) - return _collapse( - 
d, - func, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d - # @_deprecated_kwarg_check("i") - # def mean( - # self, - # axes=None, - # squeeze=False, - # mtol=1, - # weights=None, - # inplace=False, - # i=False, - # _preserve_partitions=False, - # ): # """Collapse axes with their mean. # # The mean is unweighted by default, but may be weighted (see the @@ -8086,17 +7304,6 @@ def mean( # [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]] # # """ - # return self._collapse( - # mean_f, - # mean_fpartial, - # mean_ffinalise, - # axes=axes, - # squeeze=squeeze, - # weights=weights, - # mtol=mtol, - # inplace=inplace, - # _preserve_partitions=_preserve_partitions, - # ) def mean_absolute_value( self, @@ -8104,6 +7311,7 @@ def mean_absolute_value( squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, _preserve_partitions=False, ): @@ -8139,24 +7347,32 @@ def mean_absolute_value( """ - return self._collapse( - mean_abs_f, - mean_abs_fpartial, - mean_abs_ffinalise, - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_mean_abs as collapse + + d = _inplace_enabled_define_and_cleanup(self) + + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def integral( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, _preserve_partitions=False, ): @@ -8222,36 +7438,31 @@ def integral( **Examples:** """ - if weights is None: - units = None - else: - units = self.Units - if not units: - units = Units("1") - - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = units * weights_units - else: - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * weights_units - # --- End: if - - return self._collapse( - sum_f, - sum_fpartial, - sum_ffinalise, + d = _inplace_enabled_define_and_cleanup(self) + d.sum( axes=axes, - squeeze=squeeze, weights=weights, + squeeze=squeeze, mtol=mtol, - inplace=inplace, - units=units, - _preserve_partitions=_preserve_partitions, + split_every=split_every, + inplace=True, ) + new_units = None + if weights is not None: + weights_units = getattr(weights, "Units", None) + if weights_units: + units = self.Units + if units: + new_units = units * weights_units + else: + new_units = weights_units + + if new_units is not None: + d.override_units(new_units, inplace=True) + + return d + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") @@ -8260,21 +7471,25 @@ def sample_size( axes=None, squeeze=False, mtol=1, - inplace=False, split_every=None, + inplace=False, i=False, ): - from .collapse_functions import cf_sample_size + from .collapse_functions import cf_sample_size as collapse d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( - d, - cf_sample_size, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, + weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + return d @property @@ -10831,17 +10046,20 @@ def mid_range( **Examples:** """ - from .collapse_functions import 
cf_mid_range + from .collapse_functions import cf_mid_range as collapse d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( - d, - cf_mid_range, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + return d @daskified(_DASKIFIED_VERBOSE) @@ -11128,17 +10346,22 @@ def root_mean_square( **Examples:** """ - return self._collapse( - root_mean_square_f, - root_mean_square_fpartial, - root_mean_square_ffinalise, - axes=axes, - squeeze=squeeze, + from .collapse_functions import cf_rms as collapse + + d = _inplace_enabled_define_and_cleanup(self) + + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @@ -12725,8 +11948,11 @@ def range( :Parameters: split_every: `int` or `dict`, optional - Determines the depth of the recursive aggregation. If - set to a number greater an oe equal to the number of + Determines the depth of the recursive aggregation. See + `dask.array.reduction` for details. + + + set to a number greater than to equal to the number of input chunks, the aggregation will be performed in two steps, one ``chunk`` function per input chunk and a single ``aggregate`` function at the end. If set to @@ -12758,17 +11984,20 @@ def range( **Examples:** """ - from .collapse_functions import cf_range + from .collapse_functions import cf_range as collapse d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( - d, - cf_range, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + return d @daskified(_DASKIFIED_VERBOSE) @@ -12836,25 +12065,32 @@ def sum( i=False, _preserve_partitions=False, ): - from .collapse_functions import cf_sum + from .collapse_functions import cf_sum as collapse d = _inplace_enabled_define_and_cleanup(self) - return _collapse( - d, - cf_sum, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def sum_of_squares( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, _preserve_partitions=False, ): @@ -12889,30 +12125,38 @@ def sum_of_squares( """ - units = self.Units - if units: - units = units ** 2 + from .collapse_functions import cf_sum_of_squares as collapse - return self._collapse( - sum_of_squares_f, - sum_of_squares_fpartial, - sum_of_squares_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, weights=weights, - units=units, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d._set_dask(dx, reset_mask_hardness=True) + + units = self.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) + # @_deprecated_kwarg_check("i") def sum_of_weights( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, i=False, 
_preserve_partitions=False, @@ -12930,6 +12174,10 @@ def sum_of_weights( squeeze : bool, optional + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {[inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -12942,40 +12190,49 @@ def sum_of_weights( **Examples:** """ + from .collapse_functions import cf_sum_of_weights as collapse + + d = _inplace_enabled_define_and_cleanup(self) + if weights is None: - units = Units() + units = _units_None else: - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = weights_units - else: - units = Units("1") - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * weights_units - # --- End: if + units = getattr(weights, "Units", None) + if units is None: + units = _units_None - return self._collapse( - sw_f, - sw_fpartial, - sw_ffinalise, - axes=axes, - squeeze=squeeze, + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + dx = da.sqrt(dx) + d._set_dask(dx, reset_mask_hardness=True) + + units = _units_None + if weights is not None: + units = getattr(weights, "Units", None) + if units is None: + units = _units_None + + d.override_units(units, inplace=True) + + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) def sum_of_weights2( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, i=False, _preserve_partitions=False, @@ -13005,32 +12262,33 @@ def sum_of_weights2( **Examples:** """ - if weights is None: - units = Units() - else: - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = weights_units - else: - units = Units("1") - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * (weights_units ** 2) - # --- End: if + from .collapse_functions import cf_sum_of_weights2 as collapse - return self._collapse( - sw2_f, - sw2_fpartial, - sw2_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + + dx = d._get_dask() + dx = collapse( + dx, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + dx = da.sqrt(dx) + d._set_dask(dx, reset_mask_hardness=True) + + units = _units_None + if weights is not None: + units = getattr(weights, "Units", None) + if units is None: + units = _units_None + else: + units = units ** 2 + + d.override_units(units, inplace=True) + + return d @_deprecated_kwarg_check("i") def sd( @@ -13184,18 +12442,16 @@ def sd( """ - return self._collapse( - sd_f, - sd_fpartial, - sd_ffinalise, + d = _inplace_enabled_define_and_cleanup(self) + d.var( axes=axes, - squeeze=squeeze, weights=weights, mtol=mtol, - ddof=ddof, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, + inplace=True, ) + d **= 0.5 + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -13206,88 +12462,34 @@ def var( weights=None, squeeze=False, mtol=1, - ddof=0, + ddof=0, inplace=False, split_every=None, i=False, _preserve_partitions=False, ): from .collapse_functions import cf_var - + + collapse = partial(cf_var, ddof=ddof) + d = _inplace_enabled_define_and_cleanup(self) - 
_collapse( - d, - partial(cf_var, ddof=ddof), + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) units = d.Units if units: d.override_units(units ** 2, inplace=True) return d - -# @_deprecated_kwarg_check("i") -# def var( -# self, -# axes=None, -# squeeze=False, -# weights=None, -# mtol=1, -# ddof=0, -# inplace=False, -# i=False, -# _preserve_partitions=False, -# ): -# """Collapse axes with their weighted variance. -# -# The units of the returned array are the square of the units of the -# array. -# -# .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, `sum`, -# `sd`, `stats` -# -# :Parameters: -# -# axes : (sequence of) int, optional -# -# squeeze : bool, optional -# -# weights : -# -# {{inplace: `bool`, optional}} -# -# {{i: deprecated at version 3.0.0}} -# -# :Returns: -# -# `Data` or `None` -# The collapsed array. -# -# **Examples:** -# -# """ -# units = self.Units -# if units: -# units = units ** 2 -# -# return self._collapse( -# var_f, -# var_fpartial, -# var_ffinalise, -# axes=axes, -# squeeze=squeeze, -# weights=weights, -# mtol=mtol, -# units=units, -# ddof=ddof, -# inplace=inplace, -# _preserve_partitions=_preserve_partitions, -# ) def section( self, axes, stop=None, chunks=False, min_step=1, mode="dictionary" @@ -13349,7 +12551,7 @@ def section( @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") - def sumw( + def sum( self, axes=None, weights=None, @@ -13359,18 +12561,21 @@ def sumw( split_every=None, i=False, ): - from .collapse_functions import cf_sumw + from .collapse_functions import cf_sum as collapse d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( - d, - cf_sumw, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + return d # ---------------------------------------------------------------- @@ -13389,21 +12594,24 @@ def max( axes=None, squeeze=False, mtol=1, - inplace=False, split_every=None, + inplace=False, i=False, ): - from .collapse_functions import cf_max + from .collapse_functions import cf_max as collapse d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( - d, - cf_max, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + return d @daskified(_DASKIFIED_VERBOSE) @@ -13419,17 +12627,20 @@ def min( i=False, _preserve_partitions=False, ): - from .collapse_functions import cf_min + from .collapse_functions import cf_min as collapse d = _inplace_enabled_define_and_cleanup(self) - d = _collapse( - d, - cf_min, + + dx = d._get_dask() + dx = collapse( + dx, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + d._set_dask(dx, reset_mask_hardness=True) + return d def standard_deviation( @@ -13451,7 +12662,6 @@ def standard_deviation( mtol=mtol, ddof=ddof, inplace=inplace, - _preserve_partitions=_preserve_partitions, ) def variance( @@ -13703,27 +12913,3 @@ def _where_broadcastable(data, x, name): ) return True - - -def _collapse( - d, - collapse_func, - axis=None, - weights=None, - keepdims=True, - split_every=None, - mtol=1, -): - """TODODASK.""" - dx = d._get_dask() - dx = collapse_func( - dx, - axis=axis, - weights=weights, - keepdims=keepdims, - split_every=split_every, - mtol=mtol, - ) - d._set_dask(dx, 
reset_mask_hardness=True) - - return d diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 40658c1111..f2ac2a8498 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -43,6 +43,7 @@ ma[0, 3, :, 3] = np.ma.masked ma[1, 2, 3, :] = np.ma.masked +mw = np.ma.array(w, mask=ma.mask) # If True, all tests that will not pass temporarily due to the LAMA-to-Dask # migration will be skipped. These skips will be incrementally removed as the @@ -81,6 +82,7 @@ class DataTest(unittest.TestCase): a = a w = w ma = ma + mw = mw ones = ones test_only = [] @@ -1844,58 +1846,56 @@ def test_Data_flip(self): self.assertEqual(d[0].shape, (1, 4, 5)) self.assertEqual(d[-1].shape, (1, 4, 5)) - self.assertEqual(d[0].maximum(), 4 * 5) - self.assertEqual(d[-1].maximum(), 3 * 4 * 5) + self.assertEqual(d[0].max().array, 4 * 5) + self.assertEqual(d[-1].max().array, 3 * 4 * 5) for i in (2, 1): e = d.flip(i) self.assertEqual(e[0].shape, (1, 4, 5)) self.assertEqual(e[-1].shape, (1, 4, 5)) - self.assertEqual(e[0].maximum(), 4 * 5) - self.assertEqual(e[-1].maximum(), 3 * 4 * 5) + self.assertEqual(e[0].max().array, 4 * 5) + self.assertEqual(e[-1].max().array, 3 * 4 * 5) i = 0 e = d.flip(i) self.assertEqual(e[0].shape, (1, 4, 5)) self.assertEqual(e[-1].shape, (1, 4, 5)) - self.assertEqual(e[0].maximum(), 3 * 4 * 5) - self.assertEqual(e[-1].maximum(), 4 * 5) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'datum'") - def test_Data_max(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (False, True): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(6, "metre") - ) - self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 6) - d[0, 2] = cf.masked - self.assertEqual(d.maximum(_preserve_partitions=pp), 5) - self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 5) - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(0.005, "km") - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_min(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (False, True): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.minimum(_preserve_partitions=pp), cf.Data(1, "metre") - ) - self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 1) - d[1, 0] = cf.masked - self.assertEqual(d.minimum(_preserve_partitions=pp), 2) - self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 2) - self.assertEqual( - d.minimum(_preserve_partitions=pp), cf.Data(0.002, "km") - ) + self.assertEqual(e[0].max().array, 3 * 4 * 5) + self.assertEqual(e[-1].max().array, 4 * 5) + + # def test_Data_max(self): + # if self.test_only and inspect.stack()[0][3] not in self.test_only: + # return + # + # d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre", chunks=2) + # self.assertEqual( + # d.max().array, cf.Data(6, "metre") + # ) + # self.assertEqual(d.max().array.datum(), 6) + # d[0, 2] = cf.masked + # self.assertEqual(d.max().array, 5) + # self.assertEqual(d.max().array.datum(), 5) + # self.assertEqual( + # d.maximum(_preserve_partitions=pp), cf.Data(0.005, "km") + # ) + # + # @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") + # def test_Data_min(self): + # if self.test_only and inspect.stack()[0][3] not in self.test_only: + # return + # + # for pp in (False, True): + # d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") + # self.assertEqual( + # d.minimum(_preserve_partitions=pp), cf.Data(1, "metre") + # ) + # 
self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 1) + # d[1, 0] = cf.masked + # self.assertEqual(d.minimum(_preserve_partitions=pp), 2) + # self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 2) + # self.assertEqual( + # d.minimum(_preserve_partitions=pp), cf.Data(0.002, "km") + # ) def test_Data_ndindex(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -2551,62 +2551,129 @@ def test_Data__collapse_SHAPE(self): ) # --- End: for - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_max_min_sum_sum_of_squares(self): + def test_Data_max_min(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for _np, h in zip( - (np.sum, np.amin, np.amax, np.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - if h == "sum_of_squares": - b = b ** 2 + msg = None - b = _np(b, axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) + # unmasked + d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) + for _np, h in zip( + (np.amin, np.amax), + ("min", "max"), + ): + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + if h == "sum_of_squares": + b = b ** 2 - # unweighted, masked - d = cf.Data(self.ma) - for _np, h in zip( - (np.ma.sum, np.ma.amin, np.ma.amax, np.ma.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - if h == "sum_of_squares": - b = b ** 2 + b = _np(b, axis=-1) + e = getattr(d, h)(axes=axes, squeeze=True) + if h == "sum_of_squares": + self.assertEqual(e.Units, cf.Units("m2")) - b = _np(b, axis=-1) - b = np.ma.asanyarray(b) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne={e.array}, \nb={b}") - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, \nb.mask={}".format( - h, axes, e.mask.array, b.mask - ), - ) + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) + # masked + d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) + for _np, h in zip( + (np.ma.amin, np.ma.amax), + ("min", "max"), + ): + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + if h == "sum_of_squares": + b = b ** 2 + + b = _np(b, axis=-1) + b = np.ma.asanyarray(b) + e = getattr(d, h)(axes=axes, squeeze=True) + if h == "sum_of_squares": + self.assertEqual(e.Units, cf.Units("m2")) + + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne.mask={e.mask.array}, \nb={b}") + + self.assertTrue((e.mask.array == b.mask).all(), msg) + + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne={e.array}, \nb={b}") + + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) + + def test_Data_sum_sum_of_squares(self): + if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + + msg = None + + # unweighted, unmasked + d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) + for _np, h in zip( + (np.sum, np.sum), + ("sum", 
"sum_of_squares"), + ): + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + if h == "sum_of_squares": + b = b ** 2 + + b = _np(b, axis=-1) + e = getattr(d, h)(axes=axes, squeeze=True) + if h == "sum_of_squares": + self.assertEqual(e.Units, cf.Units("m2")) + + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne={e.array}, \nb={b}") + + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) + + # unweighted, masked + d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) + for _np, h in zip( + (np.ma.sum, np.ma.sum), + ("sum", "sum_of_squares"), + ): + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + if h == "sum_of_squares": + b = b ** 2 + + b = _np(b, axis=-1) + b = np.ma.asanyarray(b) + e = getattr(d, h)(axes=axes, squeeze=True) + if h == "sum_of_squares": + self.assertEqual(e.Units, cf.Units("m2")) + + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne.mask={e.mask.array}, \nb={b}") + + self.assertTrue((e.mask.array == b.mask).all(), msg) + + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne={e.array}, \nb={b}") + + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) + + # Need to do weighted def test_Data_percentile_median(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -2796,110 +2863,110 @@ def test_Data_mean_of_upper_decile(self): "\ne={}, \nb={}".format(axes, e.array, b), ) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_range_mid_range(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 + msg = None - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) + # unweighted, unmasked + d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) + for h in ("range", "mid_range"): + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + mn = np.amin(b, axis=-1) + mx = np.amax(b, axis=-1) + if h == "range": + b = mx - mn + elif h == "mid_range": + b = (mx + mn) * 0.5 - # unweighted, masked - d = cf.Data(self.ma) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 + e = getattr(d, h)(axes=axes, squeeze=True) - b = np.ma.asanyarray(b) + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, unmasked " + # f"\ne={e.array}, \nb={b}") - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format(h, axes, e.mask.array, b.mask), - ) + # unweighted, masked + d = cf.Data(self.ma, chunks=(2, 3, 2, 5)) + for h in ("range", "mid_range"): + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + mn = np.amin(b, axis=-1) + mx = np.amax(b, axis=-1) + if h == "range": + 
b = mx - mn + elif h == "mid_range": + b = (mx + mn) * 0.5 - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) + b = np.ma.asanyarray(b) + + e = getattr(d, h)(axes=axes, squeeze=True) + + # For debugging + # msg = (f"{h}, axis={axes}, \ne.mask={e.mask.array}, " + # f "\nb.mask={b.mask}") + + self.assertTrue((e.mask.array == b.mask).all(), msg) + + # For debugging + # msg = (f"{h}, axis={axes}, unweighted, masked " + # f"\ne={e.array}, \nb={b}") + + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'w' for DataTest") def test_Data_integral(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for pp in (True, False): - # unmasked - d = cf.Data(self.a) - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) + msg = None - e = d.integral( - axes=axes, squeeze=True, weights=x, _preserve_partitions=pp - ) + # unmasked + d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) + x = cf.Data(self.w, "kg") + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + v = reshape_array(self.w, axes) + b = np.sum(b * v, axis=-1) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unmasked \ne={}, \nb={}".format( - axes, e.array, b - ), - ) + e = d.integral(axes=axes, squeeze=True, weights=x) + self.assertTrue(e.Units, cf.Units("m kg")) - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - b = np.ma.asanyarray(b) + # For debugging + # msg = f"axis={axes}, masked \ne={e.array}, \nb={b}" - e = d.integral( - axes=axes, squeeze=True, weights=x, _preserve_partitions=pp - ) + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={} masked, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) + # masked + d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + v = reshape_array(self.w, axes) + b = np.sum(b * v, axis=-1) + b = np.ma.asanyarray(b) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, masked \ne={}, \nb={}".format(axes, e.array, b), - ) + e = d.integral(axes=axes, squeeze=True, weights=x) + self.assertTrue(e.Units, cf.Units("m kg")) + + # For debugging + # msg = f"axis={axes}, masked \ne={e.mask.array}, \nb={b}" + + self.assertTrue((e.mask.array == b.mask).all(), msg) + + # For debugging + # msg = f"axis={axes}, masked \ne={e.array}, \nb={b}" + + self.assertTrue( + np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg + ) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_sum_of_weights_sum_of_weights2(self): @@ -3004,11 +3071,12 @@ def test_Data_sum_of_weights_sum_of_weights2(self): ), ) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_mean_mean_absolute_value(self): + def test_Data_sum_mean_mean_absolute_value(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return + msg = None + for absolute in (False, True): a = self.a ma = self.ma @@ -3019,18 +3087,19 @@ def test_Data_mean_mean_absolute_value(self): method = "mean_absolute_value" # unweighted, unmasked - d = cf.Data(self.a) + d = 
cf.Data(self.a, "m", chunks=(2, 3, 2, 5))
             for axes in self.axes_combinations:
                 b = reshape_array(a, axes)
                 b = np.mean(b, axis=-1)
 
                 e = getattr(d, method)(axes=axes, squeeze=True)
 
+                # For debugging
+                # msg = (f"{method} unweighted, unmasked, axis={axes}, "
+                #        f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}")
+
                 self.assertTrue(
-                    e.allclose(b, rtol=1e-05, atol=1e-08),
-                    "{} axis={}, unweighted, unmasked \ne={}, "
-                    "\nb={}".format(method, axes, e.array, b),
+                    np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg
                 )
-            # --- End: for
 
             # weighted, unmasked
             x = cf.Data(self.w)
@@ -3041,15 +3110,16 @@ def test_Data_mean_mean_absolute_value(self):
 
                 e = getattr(d, method)(axes=axes, weights=x, squeeze=True)
 
+                # For debugging
+                # msg = (f"{method} weighted, unmasked, axis={axes}, "
+                #        f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}")
+
                 self.assertTrue(
-                    e.allclose(b, rtol=1e-05, atol=1e-08),
-                    "{} weighted, unmasked axis={}, \ne={}, "
-                    "\nb={}".format(method, axes, e.array, b),
+                    np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg
                 )
-            # --- End: for
 
             # unweighted, masked
-            d = cf.Data(self.ma)
+            d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5))
             for axes in self.axes_combinations:
                 b = reshape_array(ma, axes)
                 b = np.ma.average(b, axis=-1)
                 b = np.ma.asanyarray(b)
 
                 e = getattr(d, method)(axes=axes, squeeze=True)
 
+                # For debugging
+                # msg = (f"{method} unweighted, masked, axis={axes}, "
+                #        f"\ne.mask={e.mask.array}, \nb={b}")
+
+                self.assertTrue((e.mask.array == b.mask).all(), msg)
+
+                # For debugging
+                # msg = (f"{method} unweighted, masked, axis={axes}, "
+                #        f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}")
+
                 self.assertTrue(
-                    (e.mask.array == b.mask).all(),
-                    "{} unweighted, masked axis={}, \ne.mask={}, "
-                    "\nb.mask={}".format(method, axes, e.mask.array, b.mask),
-                )
-                self.assertTrue(
-                    e.allclose(b, rtol=1e-05, atol=1e-08),
-                    "{} unweighted, masked axis={}, \ne={}, "
-                    "\nb={}, ".format(method, axes, e.array, b),
+                    np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg
                 )
-            # --- End: for
 
             # weighted, masked
             for axes in self.axes_combinations:
                 b = reshape_array(ma, axes)
                 v = reshape_array(self.mw, axes)
                 b = np.ma.average(b, axis=-1, weights=v)
                 b = np.ma.asanyarray(b)
 
                 e = getattr(d, method)(axes=axes, weights=x, squeeze=True)
 
-                self.assertTrue(
-                    (e.mask.array == b.mask).all(),
-                    "{} weighted, masked axis={}, \ne.mask={}, "
-                    "\nb.mask={}".format(method, axes, e.mask.array, b.mask),
+                # For debugging
+                # msg = (f"{method} weighted, masked, axis={axes}, "
+                #        f"\ne.mask={e.mask.array}, \nb={b}")
+
+                self.assertTrue((e.mask.array == b.mask).all(), msg)
+
+                # For debugging
+                # msg = (
+                #     f"{method} weighted, masked, axis={axes}, "
+                #     f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}"
                 )
                 self.assertTrue(
-                    e.allclose(b, rtol=1e-05, atol=1e-08),
-                    "{} weighted, masked axis={}, \ne={}, "
-                    "\nb={}, ".format(method, axes, e.array, b),
+                    np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg
                 )
-            # --- End: for
 
     @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'")
     def test_Data_root_mean_square(self):
 
From d40544bd29d874d71ccfcf032a6851738db05435 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Mon, 14 Mar 2022 20:12:50 +0000
Subject: [PATCH 06/37] dev

---
 cf/data/data.py           | 523 +++++++++++---------------------------
 cf/data/utils.py          |  67 +++++
 cf/docstring/docstring.py |  85 +++++++
 3 files changed, 301 insertions(+), 374 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index 1c3fbbaf6f..9c85050a53 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -71,6 +71,7 @@
 from .utils import (  # is_small,; is_very_small,
     YMDhms,
     _is_numeric_dtype,
+    collapse,
     conform_units,
     convert_to_datetime,
     convert_to_reftime,
@@ -6919,6 +6920,32 @@ def set_units(self, value):
         """
         self.Units = Units(value, self.get_calendar(default=None))
 
+    @daskified(_DASKIFIED_VERBOSE)
+    @_inplace_enabled(default=False)
+    @_deprecated_kwarg_check("i")
+    def max(
+        self,
+        axes=None,
+        squeeze=False,
+        mtol=1,
+        split_every=None,
+        inplace=False,
+        i=False,
+    ):
+        from .collapse_functions import cf_max
+
+        d = _inplace_enabled_define_and_cleanup(self)
+        d, _ = collapse(
+            cf_max,
+            d,
+            axis=axes,
+            keepdims=not squeeze,
+            split_every=split_every,
+            mtol=mtol,
+        )
+
+        return d
+
     @_deprecated_kwarg_check("i")
     def maximum(
         self,
@@ -6930,7 +6957,9 @@ def maximum(
         i=False,
         _preserve_partitions=False,
     ):
-        """Collapse axes with their maximum.
+        """Alias for `max`.
+
+        Collapse axes with their maximum.
 
         Missing data array elements are omitted from the calculation.
 
@@ -6960,6 +6989,8 @@ def maximum(
             inplace=inplace,
         )
 
+    @daskified(_DASKIFIED_VERBOSE)
+    @_inplace_enabled(default=False)
     def maximum_absolute_value(
         self,
         axes=None,
@@ -6967,7 +6998,6 @@ def maximum_absolute_value(
         mtol=1,
         split_every=None,
         inplace=False,
-        _preserve_partitions=False,
     ):
         """Collapse axes with their maximum absolute value.
 
@@ -7003,20 +7033,43 @@ def maximum_absolute_value(
 
         """
 
-        from .collapse_functions import cf_max_abs as collapse
+        from .collapse_functions import cf_max_abs
 
         d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, _ = collapse(
+            cf_max_abs,
+            d,
             axis=axes,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
-        d._set_dask(dx, reset_mask_hardness=True)
+        return d
 
+    @daskified(_DASKIFIED_VERBOSE)
+    @_inplace_enabled(default=False)
+    @_deprecated_kwarg_check("i")
+    def min(
+        self,
+        axes=None,
+        squeeze=False,
+        mtol=1,
+        inplace=False,
+        split_every=None,
+        i=False,
+        _preserve_partitions=False,
+    ):
+        from .collapse_functions import cf_min
+
+        d = _inplace_enabled_define_and_cleanup(self)
+        d, _ = collapse(
+            cf_min,
+            d,
+            axis=axes,
+            keepdims=not squeeze,
+            split_every=split_every,
+            mtol=mtol,
+        )
         return d
 
     @_deprecated_kwarg_check("i")
     def minimum(
         self,
@@ -7029,7 +7082,9 @@ def minimum(
         i=False,
         _preserve_partitions=False,
     ):
-        """Collapse axes with their minimum.
+        """Alias for `min`.
+
+        Collapse axes with their minimum.
 
         Missing data array elements are omitted from the calculation.
 
@@ -7061,13 +7116,14 @@ def minimum(
             inplace=inplace,
         )
 
+    @daskified(_DASKIFIED_VERBOSE)
+    @_inplace_enabled(default=False)
     def minimum_absolute_value(
         self,
         axes=None,
         squeeze=False,
         mtol=1,
+        split_every=None,
         inplace=False,
-        _preserve_partitions=False,
     ):
         """Collapse axes with their minimum absolute value.
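The `mtol` parameter threaded through `max`, `min` and the other collapses above controls when a result is masked because too few values contributed to it. The rule, matching the `N < (1 - mtol) * Nmax` test in `_collapse_mask` earlier in this patch, is that a result element is masked when its sample size `N` is strictly less than `(1 - mtol) * Nmax`. A minimal plain-numpy sketch of that rule (an illustration only, not the dask implementation):

import numpy as np

# Toy data: the second row is entirely missing
a = np.ma.masked_equal([[1, 2, -99], [-99, -99, -99]], -99)

axis = 1
mtol = 0.5                # tolerate up to half of the values being missing
N = a.count(axis=axis)    # per-result sample size: [2, 0]
Nmax = a.shape[axis]      # maximum possible sample size: 3

# Mask any result derived from a too-small sample size
result = np.ma.masked_where(N < (1 - mtol) * Nmax, a.max(axis=axis))
print(result)             # [2 --]
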
@@ -7078,12 +7134,6 @@ def minimum_absolute_value( :Parameters: - axes : (sequence of) int, optional - - squeeze : bool, optional - - {{inplace: `bool`, optional}} - :Returns: `Data` or `None` @@ -7095,7 +7145,7 @@ def minimum_absolute_value( >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') >>> d.minimum_absolute_value() - >>> d.d.min() + >>> d.min() >>> d.minimum_absolute_value(axes=1) @@ -7103,20 +7153,17 @@ def minimum_absolute_value( """ - from .collapse_functions import cf_min_abs as collapse + from .collapse_functions import cf_min_abs d = _inplace_enabled_define_and_cleanup(self) - - dx = d._get_dask() - dx = collapse( - dx, + d, _ = collapse( + cf_min_abs, + d, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) - d._set_dask(dx, reset_mask_hardness=True) - return d @daskified(_DASKIFIED_VERBOSE) @@ -7132,179 +7179,31 @@ def mean( split_every=None, i=False, ): - from .collapse_functions import cf_mean as collapse + """Collapse axes with their mean. - d = _inplace_enabled_define_and_cleanup(self) + The mean is unweighted by default, but may be weighted (see the + *weights* parameter). - dx = d._get_dask() - dx = collapse( - dx, + Missing data array elements and their corresponding weights + are omitted from the calculation. + + """ + from .collapse_functions import cf_mean + + d = _inplace_enabled_define_and_cleanup(self) + d, _ = collapse( + cf_mean, + d, axis=axes, weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) - d._set_dask(dx, reset_mask_hardness=True) - return d - # """Collapse axes with their mean. - # - # The mean is unweighted by default, but may be weighted (see the - # *weights* parameter). - # - # Missing data array elements and their corresponding weights - # are omitted from the calculation. - # - # :Parameters: - # - # axes: (sequence of) int, optional - # The axes to be collapsed. By default flattened input is - # used. Each axis is identified by its integer position. No - # axes are collapsed if *axes* is an empty sequence. - # - # squeeze: `bool`, optional - # If True then collapsed axes are removed. By default the - # axes which are collapsed are left in the result as axes - # with size 1, meaning that the result is guaranteed to - # broadcast correctly against the original array. - # - # weights: data-like or dict, optional - # Weights associated with values of the array. By default - # all non-missing elements of the array are assumed to have - # a weight equal to one. If *weights* is a data-like object - # then it must have either the same shape as the array or, - # if that is not the case, the same shape as the axes being - # collapsed. If *weights* is a dictionary then each key is - # axes of the array (an int or tuple of ints) with a - # corresponding data-like value of weights for those - # axes. In this case, the implied weights array is the outer - # product of the dictionary's values. - # - # *Parameter example:* - # If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - # 1-dimensional weights for axis 1 and ``x`` must contain - # 2-dimensional weights for axes 2 and 0. This is - # equivalent, for example, to ``weights={(1, 2, 0), y}``, - # where ``y`` is the outer product of ``w`` and ``x``. If - # ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - # equivalent to ``weights=y``. If ``axes=None`` and the - # array is 3-dimensional then ``weights={(1, 2, 0), y}`` - # is equivalent to ``weights=y.transpose([2, 0, 1])``. 
-        #
-        #     mtol: number, optional
-        #
-        #     {{inplace: `bool`, optional}}
-        #
-        #     {{i: deprecated at version 3.0.0}}
-        #
-        # :Returns:
-        #
-        #     `Data` or `None`
-        #         The collapsed array.
-        #
-        # .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`,
-        #     `var`
-        #
-        # **Examples:**
-        #
-        # >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm')
-        # >>> print(d.array)
-        # [[1 2 4]
-        #  [1 4 9]]
-        #
-        # >>> d.mean()
-        #
-        # >>> d.mean(squeeze=True)
-        #
-        # >>> d.mean(axes=[0, 1])
-        #
-        # >>> d.mean(axes=[1, 0])
-        #
-        # >>> print(d.mean(axes=0).array)
-        # [[1. 3. 6.5]]
-        # >>> print(d.mean(axes=1).array)
-        # [[2.33333333]
-        #  [4.66666667]]
-        # >>> d.mean(axes=1, squeeze=True)
-        #
-        #
-        # >>> y = cf.Data([1, 3])
-        # >>> x = cf.Data([1, 2, 1])
-        # >>> w = cf.Data.insert_dimension(y, 1) * x
-        # >>> print(w.array)
-        # [[1 2 1]
-        #  [3 6 3]]
-        #
-        # >>> d.mean(weights=w)
-        #
-        # >>> d.mean(weights={(0, 1): w})
-        #
-        # >>> d.mean(axes=[0, 1], weights={(0, 1): w})
-        #
-        # >>> d.mean(axes=[1, 0], weights={(0, 1): w})
-        #
-        # >>> d.mean(axes=(0, 1), weights={1: x, 0: y})
-        #
-        #
-        # >>> d.mean(axes=1, weights=w)
-        #
-        # >>> d.mean(axes=1, weights=x)
-        #
-        # >>> d.mean(axes=1, weights={1: x})
-        #
-        # >>> d.mean(axes=1, weights={(0, 1): w})
-        #
-        # >>> d.mean(axes=1, weights={0: y, (1,): x})
-        #
-        #
-        # >>> d.mean(axes=1)
-        #
-        # >>> d.mean(axes=1, weights={0: y})
-        #
-        #
-        # >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4))
-        # >>> print(e.array)
-        # [[[ 0  1  2  3]
-        #   [ 4  5  6  7]]
-        #  [[ 8  9 10 11]
-        #   [12 13 14 15]]
-        #  [[16 17 18 19]
-        #   [20 21 22 23]]]
-        #
-        # >>> e.mean(axes=[0, 2])
-        #
-        # >>> f = e.mean(axes=[0, 2], squeeze=True)
-        # >>> f
-        #
-        # >>> f.shape
-        # (2,)
-        # >>> print(e.mean(axes=[0, 1]).array)
-        # [[[10. 11. 12. 13.]]]
-        # >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array)
-        # [[[11. 12. 13. 14.]]]
-        #
-        # >>> e[0, 0] = cf.masked
-        # >>> e[-1, -1] = cf.masked
-        # >>> e[..., 2] = cf.masked
-        # >>> print(e.array)
-        # [[[-- -- -- --]
-        #   [4 5 -- 7]]
-        #  [[8 9 -- 11]
-        #   [12 13 -- 15]]
-        #  [[16 17 -- 19]
-        #   [-- -- -- --]]]
-        #
-        # >>> e.mean()
-        #
-        # >>> print(e.mean(axes=[0, 1]).array)
-        # [[[10.0 11.0 -- 13.0]]]
-        # >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array)
-        # [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]]
-        #
-        # """
-
+    @daskified(_DASKIFIED_VERBOSE)
+    @_inplace_enabled(default=False)
     def mean_absolute_value(
         self,
         axes=None,
@@ -7324,12 +7223,6 @@ def mean_absolute_value(
 
         :Parameters:
 
-            axes : (sequence of) int, optional
-
-            weights:
-
-            squeeze : bool, optional
-
             {{inplace: `bool`, optional}}
 
         :Returns:
@@ -7347,21 +7240,18 @@ def mean_absolute_value(
 
         """
 
-        from .collapse_functions import cf_mean_abs as collapse
+        from .collapse_functions import cf_mean_abs
 
        d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, _ = collapse(
+            cf_mean_abs,
+            d,
             axis=axes,
             weights=weights,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
-        d._set_dask(dx, reset_mask_hardness=True)
-
         return d
 
     @daskified(_DASKIFIED_VERBOSE)
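For reference, the weighted, masked mean that `mean` and `mean_absolute_value` compute matches `numpy.ma.average`, which is exactly what the tests in this patch compare against. A small sketch of those semantics:

import numpy as np

a = np.ma.masked_invalid([[1.0, 2.0, 4.0], [1.0, np.nan, 9.0]])
w = np.array([1.0, 2.0, 1.0])

# Missing elements and their corresponding weights are omitted
print(np.ma.average(a, axis=1, weights=w))  # [2.25 5.0]
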
@@ -7384,28 +7274,26 @@ def integral(
 
         :Parameters:
 
-            axes: (sequence of) int, optional
-                The axes to be collapsed. By default flattened input is
-                used. Each axis is identified by its integer position. No
-                axes are collapsed if *axes* is an empty sequence.
+            {{collapse axes: (sequence of) int, optional}}
 
-            squeeze: `bool`, optional
-                If True then collapsed axes are removed. By default the
-                axes which are collapsed are left in the result as axes
-                with size 1, meaning that the result is guaranteed to
-                broadcast correctly against the original array.
+            {{collapse squeeze: `bool`, optional}}
 
-            weights: data-like or dict, optional
-                Weights associated with values of the array. By default
-                all non-missing elements of the array are assumed to have
-                a weight equal to one. If *weights* is a data-like object
-                then it must have either the same shape as the array or,
-                if that is not the case, the same shape as the axes being
-                collapsed. If *weights* is a dictionary then each key is
-                axes of the array (an int or tuple of ints) with a
-                corresponding data-like value of weights for those
-                axes. In this case, the implied weights array is the outer
-                product of the dictionary's values.
+            weights: data_like or dict, optional
+                Weights associated with values of the array. By
+                default all non-missing elements of the array are
+                assumed to have a weight equal to one. If *weights* is
+                a data_like object then it must have either the same
+                shape as the array or, if that is not the case, the
+                same shape as the axes being collapsed.
+
+                If *weights* is a dictionary then each key specifies
+                axes of the array (an `int` or `tuple` of `int`), with
+                a corresponding value of data_like weights for those
+                axes. In this case, the implied weights array is the
+                outer product of the dictionary's values.
 
                 Note that the units of the weights matter for an
                 integral collapse, which differs from a weighted sum in
                 that the units
@@ -7438,14 +7326,17 @@ def integral(
         **Examples:**
 
         """
+        from .collapse_functions import cf_sum
+
         d = _inplace_enabled_define_and_cleanup(self)
-        d.sum(
-            axes=axes,
+        d, weights = collapse(
+            cf_sum,
+            d,
+            axis=axes,
             weights=weights,
-            squeeze=squeeze,
-            mtol=mtol,
+            keepdims=not squeeze,
             split_every=split_every,
-            inplace=True,
+            mtol=mtol,
         )
 
         new_units = None
@@ -7475,22 +7366,17 @@ def sample_size(
         inplace=False,
         i=False,
     ):
-        from .collapse_functions import cf_sample_size as collapse
+        from .collapse_functions import cf_sample_size
 
         d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, _ = collapse(
+            cf_sample_size,
+            d,
             axis=axes,
-            weights=weights,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
-        d._set_dask(dx, reset_mask_hardness=True)
-
-        return d
+
+        return d
 
     @property
     def binary_mask(self):
@@ -12063,23 +11949,19 @@ def sum(
         inplace=False,
         split_every=None,
         i=False,
-        _preserve_partitions=False,
     ):
-        from .collapse_functions import cf_sum as collapse
+        from .collapse_functions import cf_sum
 
         d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, _ = collapse(
+            cf_sum,
+            d,
             axis=axes,
             weights=weights,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
-        d._set_dask(dx, reset_mask_hardness=True)
-
         return d
 
     @daskified(_DASKIFIED_VERBOSE)
@@ -12125,22 +12007,20 @@ def sum_of_squares(
 
         """
 
-        from .collapse_functions import cf_sum_of_squares as collapse
+        from .collapse_functions import cf_sum_of_squares
 
         d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, _ = collapse(
+            cf_sum_of_squares,
+            d,
             axis=axes,
             weights=weights,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
-        d._set_dask(dx, reset_mask_hardness=True)
 
-        units = self.Units
+        units = d.Units
         if units:
             d.override_units(units ** 2, inplace=True)
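The units handling in the methods above is exercised by the tests later in this patch: `sum_of_squares` squares the data units, while `integral` multiplies the data units by the weights units. A hedged usage sketch, assuming a build in which these daskified collapses are active (the printed unit forms are indicative):

import cf

d = cf.Data([[1.0, 2.0, 4.0], [1.0, 4.0, 9.0]], "m")
print(d.sum_of_squares().Units)      # <Units: m2>

w = cf.Data([2.0, 1.0, 2.0], "kg")
e = d.integral(axes=1, weights=w, squeeze=True)
print(e.Units)                       # <Units: m.kg>
print(e.array)                       # [12. 24.]  (sum of data * weights)
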
@@ -12190,28 +12070,18 @@ def sum_of_weights(
 
         **Examples:**
 
         """
-        from .collapse_functions import cf_sum_of_weights as collapse
+        from .collapse_functions import cf_sum_of_weights
 
         d = _inplace_enabled_define_and_cleanup(self)
-
-        if weights is None:
-            units = _units_None
-        else:
-            units = getattr(weights, "Units", None)
-            if units is None:
-                units = _units_None
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, weights = collapse(
+            cf_sum_of_weights,
+            d,
             axis=axes,
             weights=weights,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
 
         units = _units_None
         if weights is not None:
@@ -12262,21 +12132,18 @@ def sum_of_weights2(
 
         **Examples:**
 
         """
-        from .collapse_functions import cf_sum_of_weights2 as collapse
+        from .collapse_functions import cf_sum_of_weights2
 
         d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, weights = collapse(
+            cf_sum_of_weights2,
+            d,
             axis=axes,
             weights=weights,
             keepdims=not squeeze,
             split_every=split_every,
             mtol=mtol,
         )
 
         units = _units_None
         if weights is not None:
@@ -12297,7 +12164,8 @@ def sd(
         squeeze=False,
         mtol=1,
         weights=None,
-        ddof=0,
+        ddof=0,  # TODODASK: Is this the right default?
+        split_every=None,
         inplace=False,
         i=False,
         _preserve_partitions=False,
@@ -12446,12 +12314,13 @@ def sd(
         d.var(
             axes=axes,
             weights=weights,
+            squeeze=squeeze,
             mtol=mtol,
+            ddof=ddof,
             split_every=split_every,
             inplace=True,
         )
-        d **= 0.5
-        return d
+        return d ** 0.5  # TODODASK: replace with sqrt
 
     @daskified(_DASKIFIED_VERBOSE)
     @_inplace_enabled(default=False)
@@ -12470,20 +12339,17 @@ def var(
     ):
         from .collapse_functions import cf_var
 
-        collapse = partial(cf_var, ddof=ddof)
-
         d = _inplace_enabled_define_and_cleanup(self)
-
-        dx = d._get_dask()
-        dx = collapse(
-            dx,
+        d, _ = collapse(
+            cf_var,
+            d,
             axis=axes,
             weights=weights,
             keepdims=not squeeze,
-            split_every=split_every,
             mtol=mtol,
+            split_every=split_every,
+            ddof=ddof,
         )
-        d._set_dask(dx, reset_mask_hardness=True)
 
         units = d.Units
         if units:
             d.override_units(units ** 2, inplace=True)
 
         return d
 
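Note that `sd` above defers to `var` and takes the square root, and both forward `ddof` to `cf_var` (here via the `collapse` helper's `ddof` keyword, so it is passed exactly once). A quick plain-numpy reminder of what `ddof` changes, namely the delta degrees of freedom in the variance divisor `N - ddof`:

import numpy as np

a = np.array([4.0, 5.0, 6.0, 7.0])

print(a.var(ddof=0))           # 1.25               (divide by N)
print(a.var(ddof=1))           # 1.6666666666666667 (divide by N - 1)
print(np.sqrt(a.var(ddof=0)))  # the standard deviation, as in `sd`
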
squeeze=False, - mtol=1, - inplace=False, - split_every=None, - i=False, - _preserve_partitions=False, - ): - from .collapse_functions import cf_min as collapse - - d = _inplace_enabled_define_and_cleanup(self) - - dx = d._get_dask() - dx = collapse( - dx, - axis=axes, - keepdims=not squeeze, - split_every=split_every, - mtol=mtol, - ) - d._set_dask(dx, reset_mask_hardness=True) - - return d - def standard_deviation( self, axes=None, @@ -12652,7 +12431,6 @@ def standard_deviation( ddof=0, inplace=False, i=False, - _preserve_partitions=False, ): """Alias for `sd`""" return self.sd( @@ -12662,6 +12440,7 @@ def standard_deviation( mtol=mtol, ddof=ddof, inplace=inplace, + i=i, ) def variance( @@ -12673,7 +12452,6 @@ def variance( ddof=0, inplace=False, i=False, - _preserve_partitions=False, ): """Alias for `var`""" return self.var( @@ -12683,13 +12461,10 @@ def variance( mtol=mtol, ddof=ddof, inplace=inplace, - _preserve_partitions=_preserve_partitions, + i=i, ) -# --- End: class - - def _size_of_index(index, size=None): """Return the number of elements resulting in applying an index to a sequence. diff --git a/cf/data/utils.py b/cf/data/utils.py index cda7f77c6e..42e975bf99 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -574,3 +574,70 @@ def YMDhms(d, attr): d._map_blocks(partial(cf_YMDhms, attr=attr), dtype=int) d.override_units(Units(None), inplace=True) return d + + +def process_weights(d, weights, axis): + """TODODASK.""" + if not isinstance(weights, dict): + return weights + + if not w: + # No weights + return + + weights = weights.copy() + weights_axes = set() + for key, value in tuple(weights.items()): + key = d._parse_axes(key) + if weights_axes.intersection(key): + raise ValueError("Duplicate weights axis") + + weights_axes.update(key) + weights[tuple(key)] = weights.pop(key) + + if not weights_axes.intersection(axis): + # No weights span collapse axes + return + + # Add missing dimensions to each component as size 1 + w = [] + shape = d.shape + for key, value in weights.items(): + new_shape = [n if i in key else 1 for i, n in enumerate(shape)] + w.append(value.reshape(new_shape)) + + # Return the product of the weights components + return reduce(mul, w) + + +def collapse( + func, + d, + axis=None, + weights=None, + keepdims=True, + mtol=1, + split_every=None, + ddof=None, +): + """TODODASK.""" + kwargs = { + "axis": axis, + "keepdims": keepdims, + "split_every": split_every, + "mtol": mtol, + } + + if weights is not None: + weights = process_weights(d, weights, axis) + if weights is not None: + kwargs["weights"] = weights + + if ddof is not None: + kwargs["ddof"] = ddof + + dx = d.to_dask_array() + dx = func(dx, **kwargs) + d._set_dask(dx, reset_mask_hardness=True) + + return d, weights diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 8c1f69ce05..f3070edd49 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -246,6 +246,91 @@ domain axis. If the vertical axis does not appear in the computed non-parametric coodinates then this an empty tuple.""", + # collapse axes + "{{collapse axes: (sequence of) int, optional}}": """axes: (sequence of) int, optional + The axes to be collapsed. By default flattened input + is used. Each axis is identified by its integer + position. No axes are collapsed if *axes* is an empty + sequence. 
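                    As a small illustration of the axis convention described
                    above (a hedged numpy sketch; the collapse functions follow
                    the same integer-position convention):

                    >>> import numpy as np
                    >>> a = np.arange(6).reshape(2, 3)
                    >>> int(a.max())  # axes=None: flattened input
                    5
                    >>> a.max(axis=0)  # axes=0: collapse the first axis
                    array([3, 4, 5])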
TODODASK - is the axes=() behaviour + correct??""", + # collapse axes + "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional + By default, the axes which are collapsed are left in + the result as dimensions with size one, so that the + result will broadcast correctly against the input + array. If set to True then collapsed axes are removed + from the data.""", + # collapse weights + "{{collapse weights: optional}}}": """weights: optional + Weights associated with values of the array. By + default all non-missing elements of the array are + assumed to have a weight equal to one. + + If *weights* is a data_like object then it must be + broadcastable to the array or, if that is not the + case, the same shape as the axes being + collapsed. TODODASK - scrib the last possibility? + + If *weights* is a dictionary then each key specifies + axes of the array (an `int` or `tuple` of `int`), with + a corresponding value of data_like weights for those + axes. In this case, the implied weights array is the + outer product of the dictionary's values. + + + Specify the weights for the collapse axes. The weights + are, in general, those that would be returned by this + call of the field construct's `weights` method: + ``f.weights(weights, axes=axes, measure=measure, + scale=scale, radius=radius, great_circle=great_circle, + components=True)``. See the *axes*, *measure*, + *scale*, *radius* and *great_circle* parameters and + `cf.Field.weights` for details. + + .. note:: By default *weights* is `None`, resulting in + **unweighted calculations**. + + If the alternative form of providing the collapse method + and axes combined as a CF cell methods-like string via the + *method* parameter has been used, then the *axes* + parameter is ignored and the axes are derived from the + *method* parameter. For example, if *method* is ``'T: + area: minimum'`` then this defines axes of ``['T', + 'area']``. If *method* specifies multiple collapses, + e.g. ``'T: minimum area: mean'`` then this implies axes of + ``'T'`` for the first collapse, and axes of ``'area'`` for + the second collapse. + + .. note:: Setting *weights* to `True` is generally a good + way to ensure that all collapses are + appropriately weighted according to the field + construct's metadata. In this case, if it is not + possible to create weights for any axis then an + exception will be raised. + + However, care needs to be taken if *weights* is + `True` when cell volume weights are desired. The + volume weights will be taken from a "volume" + cell measure construct if one exists, otherwise + the cell volumes will be calculated as being + proportional to the sizes of one-dimensional + vertical coordinate cells. In the latter case + **if the vertical dimension coordinates do not + define the actual height or depth thickness of + every cell in the domain then the weights will + be incorrect**. + + *Parameter example:* + To specify weights based on the field construct's + metadata for all collapse axes use ``weights=True``. + + *Parameter example:* + To specify weights based on cell areas use + ``weights='area'``. 
+ + *Parameter example:* + To specify weights based on cell areas and linearly in + time you could set ``weights=('area', 'T')``.""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- From f6ffd6d47acb531ee3f7e4248acd8d51f3b5fd2d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 14 Mar 2022 23:06:08 +0000 Subject: [PATCH 07/37] dev --- cf/data/utils.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/cf/data/utils.py b/cf/data/utils.py index 42e975bf99..61c0a5c92a 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -576,37 +576,52 @@ def YMDhms(d, attr): return d -def process_weights(d, weights, axis): +def format_weights(d, weights, axis=None): """TODODASK.""" if not isinstance(weights, dict): return weights - if not w: + if not weights: # No weights return + ndim = d.ndim + if axis is None: + axis = tuple(range(d.ndim)) + else: + axis = d._parse_axes(axis) + weights = weights.copy() weights_axes = set() for key, value in tuple(weights.items()): - key = d._parse_axes(key) + del weights[key] + key = tuple(d._parse_axes(key)) if weights_axes.intersection(key): raise ValueError("Duplicate weights axis") + + if value.ndim > ndim: TODO: test weights .shpae against impled eaxes shape + raise ValueError( + f"Weights component for axes {key} with shape " + f"{weights.shape} is not broadcastable to data with " + f"shape {d.shape}" + ) + weights[key] = value weights_axes.update(key) - weights[tuple(key)] = weights.pop(key) if not weights_axes.intersection(axis): # No weights span collapse axes return - # Add missing dimensions to each component as size 1 + # For each componente, add missing dimensions as size 1. w = [] shape = d.shape for key, value in weights.items(): new_shape = [n if i in key else 1 for i, n in enumerate(shape)] w.append(value.reshape(new_shape)) - # Return the product of the weights components + # Return the product of the weights components, which will be + # broadcastable to d return reduce(mul, w) @@ -629,7 +644,7 @@ def collapse( } if weights is not None: - weights = process_weights(d, weights, axis) + weights = format_weights(d, weights, axis) if weights is not None: kwargs["weights"] = weights From 1e4389e454c5d6c433fc88cd1b511692e7e95b18 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 15 Mar 2022 22:48:50 +0000 Subject: [PATCH 08/37] dev --- cf/data/collapse_functions.py | 1904 ++++++++++++++++++++++----------- cf/data/data.py | 722 +++++++++---- cf/data/utils.py | 82 -- cf/docstring/docstring.py | 145 +-- cf/test/test_Data.py | 688 +++++++----- 5 files changed, 2302 insertions(+), 1239 deletions(-) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index aa4d4a27d0..3985a9c528 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -2,6 +2,7 @@ from operator import mul import numpy as np +from cfdm.core import DocstringRewriteMeta from dask.array import chunk from dask.array.core import _concatenate2 from dask.array.reductions import divide, numel, reduction @@ -9,10 +10,736 @@ from dask.utils import deepmap # Apply function inside nested lists +class Collapse(metaclass=DocstringRewriteMeta): + """Container for functions that collapse `dask` arrays. + + .. 
versionadded:: TODODASK + + """ + + @staticmethod + def max(a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return maximum values of an array. + + Calculates the maximum value of an array or the maximum values + along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = a.dtype + return reduction( + a, + cf_max_chunk, + partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_max_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return maximum absolute values of an array. + + Calculates the maximum absolute value of an array or the + maximum absolute values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = a.dtype + return reduction( + a, + cf_max_abs_chunk, + partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_max_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def mean( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + ): + """Return mean values of an array. + + Calculates the mean value of an array or the mean values along + axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = "f8" + return reduction( + a, + cf_mean_chunk, + partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def mean_abs( + a, weights=None, axis=None, keepdims=False, mtol=None, split_every=None + ): + """"Return mean absolute values of an array. + + Calculates the mean absolute value of an array or the mean + absolute values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. 
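        **Examples**

        A minimal usage sketch (illustrative values; note that *mtol*
        must be numeric, e.g. ``1`` for the default behaviour of
        masking a result only when all of its contributing inputs are
        missing):

        >>> import dask.array as da
        >>> dx = da.from_array([[1.0, 2.0], [3.0, 5.0]], chunks=1)
        >>> Collapse.mean(dx, axis=1, mtol=1).compute()
        array([1.5, 4. ])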
+ + """ + dtype = "f8" + return reduction( + a, + cf_mean_abs_chunk, + partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def mid_range( + a, axis=None, dtype=None, keepdims=False, mtol=None, split_every=None + ): + """Return mid-range values of an array. + + Calculates the mid-range value of an array or the mid-range + values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = "f8" + return reduction( + a, + cf_range_chunk, + partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def min(a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return minimum values of an array. + + Calculates the minimum value of an array or the minimum values + along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = a.dtype + return reduction( + a, + cf_min_chunk, + partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_min_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return minimum absolute values of an array. + + Calculates the minimum absolute value of an array or the + minimum absolute values along axes. + + .. versionadded:: TODODASK + + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = a.dtype + return reduction( + a, + cf_min_abs_chunk, + partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_min_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def range(a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return range values of an array. + + Calculates the range value of an array or the range values + along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. 
+ + """ + dtype = a.dtype + return reduction( + a, + cf_range_chunk, + partial(cf_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def rms( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + ): + """Return root mean square (RMS) values of an array. + + Calculates the RMS value of an array or the RMS values along + axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = "f8" + return reduction( + a, + cf_rms_chunk, + partial(cf_rms_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return sample size values of an array. + + Calculates the sample size value of an array or the sample + size values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = "i8" + return reduction( + a, + cf_sample_size_chunk, + partial(cf_sample_size_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sample_size_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @staticmethod + def sum( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + ): + """Return sum values of an array. + + Calculates the sum value of an array or the sum values along + axes. + + .. versionadded:: TODODASK + + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + if weights is None: + dtype = double_precision_dtype(a) + else: + dtype = "f8" + + return reduction( + a, + cf_sum_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def sum_of_squares( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + ): + """Return sum of square values of an array. + + Calculates the sum of square value of an array or the sum of + square values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. 
+ + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + if weights is None: + dtype = double_precision_dtype(a) + else: + dtype = "f8" + + return reduction( + a, + partial(cf_sum_chunk, squared=True), + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def sum_of_weights( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + ): + """Return sum of weights values for an array. + + Calculates the sum of weights value for an array or the sum of + weights values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = "f8" + return reduction( + a, + cf_sum_of_weights_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def sum_of_weights2( + a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + ): + """Return sum of squares of weights values for an array. + + Calculates the sum of squares of weights value for an array or + the sum of squares of weights values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + dtype = "f8" + return reduction( + a, + partial(cf_sum_of_weights_chunk, squared=True), + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @staticmethod + def var( + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + ddof=None, + split_every=None, + ): + """Return variances of an array. + + Calculates the variance value of an array or the variance + values along axes. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional} + + {{ddof: number}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. 
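        **Examples**

        A hedged sketch of the two supported cases (*ddof* has no
        usable default and must be set explicitly; see `cf_var_agg`
        below, which raises otherwise):

        >>> v0 = Collapse.var(dx, axis=1, mtol=1, ddof=0)  # biased variance
        >>> v1 = Collapse.var(dx, axis=1, mtol=1, ddof=1)  # Bessel-corrected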
+ + """ + dtype = "f8" + return reduction( + a, + partial(cf_var_chunk, ddof=ddof), + partial(cf_var_agg, mtol=mtol, ddof=ddof, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_var_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + +def double_precision_dtype(a): + """Return the double precision data type of an aray. + + :Parameters: + + a: array_like + + :Returns: + + `str` + The double precision type. + + **Examples** + + >>> for d in (float, 'float32', int, 'int32'): + ... print(double_precision_dtype(np.array(9, dtype=d))) + ... + f8 + f8 + i8 + i8 + + """ + return a.dtype.kind + "8" + + def mask_small_sample_size(x, N, axis, mtol, original_shape): """Mask elements where the sample size of the collapsed data is below a threshold. + .. versionadded:: TODODASK + :Parameters: x: `numpy.ndarray` @@ -25,6 +752,21 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): The axes being collapsed. mtol: number + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + A missing datum in the output array occurs whenever at + least ``100*mtol%`` of its contributing input array + elements are non-missing data. The default of *mtol* is 1, + meaning that a missing datum in the output array only + occurs when all of its contributing input array elements + are missing data. A value of 0 means that a missing datum + in the output array occurs whenever any of its + contributing input array elements are missing. Any + intermediate value is allowed. Note that for non-zero + values of *mtol*, different collapsed elements may have + different sample sizes, depending on the distribution of + missing data in the input data. original_shape: `tuple` The shape of the original, uncollapsed data. @@ -33,7 +775,8 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): `numpy.ndarray` Array *x* masked where *N* is sufficiently small. Note - that input *x* may be modified in-place with the output. + that the input *x* might be modified in-place with the + contents of the output. """ if not x.ndim: @@ -42,22 +785,32 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): x = np.asanyarray(x) if mtol < 1: - # Nmax = total number of element, including an missing values + # Nmax = total number of elements, including missing values Nmax = reduce(mul, [original_shape[i] for i in axis], 1) x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) return x -def sum_of_weights( - x, weights=None, squared=False, dtype="f8", N=None, **kwargs -): - """TODO.""" +def sum_weights(x, weights=None, squared=False, dtype="f8", N=None, **kwargs): + """TODO. + + .. versionadded:: TODODASK + + :Parameters: + + squared: `bool`, optional + If True calculate the sum of the squares of the weights. + + """ if weights is None: + # All weights are 1, so the sum of the weights and the sum of + # the squares of the weights are both equal to the sample + # size. 
         if N is None:
-            N = cf_sample_size_chunk(x, **kwargs)["N"]
+            return cf_sample_size_chunk(x, dtype=dtype, **kwargs)["N"]
 
-        return N
+        return N.astype(dtype)
 
     if squared:
         weights = np.multiply(weights, weights, dtype=dtype)
 
@@ -71,14 +824,25 @@ def combine_arrays(
     pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs
 ):
-    # Create a nested list of N and recursively concatenate it
-    # along the specified
+    """Worker function for the combine callables.
+
+    Select arrays by dictionary key from a nested list of
+    dictionaries, concatenate the resulting nested list of arrays
+    along the given axes, and apply a function to the result along
+    those same axes.
+
+    :Returns:
+
+        `numpy.ndarray`
+
+    """
     x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs
 
     if dtype:
         kwargs["dtype"] = dtype
 
-    return func(_concatenate2(x, axes=axis), axis=axis, **kwargs)
+    x = _concatenate2(x, axes=axis)
+    return func(x, axis=axis, **kwargs)
 
 
 def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
@@ -89,7 +853,11 @@ def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
 
 
 def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
-    """Alias of `combine_arrays` with ``func=chunk.max``."""
+    """Alias of `combine_arrays` with ``func=chunk.max``.
+
+    .. versionadded:: TODODASK
+
+    """
     return combine_arrays(
         pairs, key, chunk.max, axis, dtype, computing_meta, **kwargs
     )
@@ -104,7 +872,11 @@ def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs):
 
 def sum_sample_sizes(pairs, axis, **kwargs):
     """Alias of `combine_arrays` with ``key="N", func=chunk.sum,
-    dtype="i8", computing_meta=False``."""
+    dtype="i8", computing_meta=False``.
+
+    .. versionadded:: TODODASK
+
+    """
     return combine_arrays(
         pairs, "N", chunk.sum, axis, dtype="i8", computing_meta=False, **kwargs
     )
 
 
@@ -114,36 +886,24 @@ def sum_sample_sizes(pairs, axis, **kwargs):
 # --------------------------------------------------------------------
 # mean
 # --------------------------------------------------------------------
 def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs):
-    """Return chunk-based values for calculating the global mean.
-
-    :Parameters:
+    """Chunk calculations for the mean.
 
-        x: numpy.ndarray
-            Chunks data being reduced along one or more axes.
+    This function is passed to `dask.array.reduction` as the callable
+    *chunk* parameter.
 
-        weights: numpy array-like, optional
-            Weights to be used in the reduction of *x*, with the same
-            shape as *x*. By default the reduction is unweighted.
+    .. versionadded:: TODODASK
 
-        dtype: data_type, optional
-            Data type of global reduction.
-
-        computing_meta: `bool` optional
-            See `dask.array.reductions` for details.
+    :Parameters:
 
-        kwargs: `dict`, optional
-            See `dask.array.reductions` for details.
+        See `dask.array.reductions` for details.
 
     :Returns:
 
         `dict`
             Dictionary with the keys:
-
             * N: The sample size.
-
-            * V1: The sum of ``weights`` (set to ``N`` if weights are
-              not present).
-
+            * V1: The sum of ``weights`` (equal to ``N`` if weights
+              are not set).
            * sum: The weighted sum of ``x``.
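+
+            A worked sketch of this contract (illustrative numbers),
+            assuming two chunks with unit weights, so that ``V1 == N``:
+
+            >>> c1 = {"N": 3, "V1": 3.0, "sum": 6.0}
+            >>> c2 = {"N": 2, "V1": 2.0, "sum": 10.0}
+            >>> (c1["sum"] + c2["sum"]) / (c1["V1"] + c2["V1"])
+            3.2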
""" @@ -153,10 +913,7 @@ def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): # N, sum d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) - if weights is None: - d["V1"] = d["N"] - else: - d["V1"] = chunk.sum(weights, dtype=dtype, **kwargs) + d["V1"] = sum_weights(x, weights, N=d["N"], **kwargs) return d @@ -168,7 +925,19 @@ def cf_mean_combine( computing_meta=False, **kwargs, ): - """Apply the function to the data in a nested list of arrays.""" + """Combine calculations for the mean. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details. + + :Returns: + + As for `cf_mean_chunk`. + + """ if not isinstance(pairs, list): pairs = [pairs] @@ -191,9 +960,32 @@ def cf_mean_agg( original_shape=None, **kwargs, ): - """Apply the function to the data in a nested list of arrays and - mask where the sample size is below the threshold.""" - d = cf_mean_combine(pairs, axis, computing_meta, **kwargs) + """"Aggregate calculations for the mean. + + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for further details. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) if computing_meta: return d @@ -202,64 +994,30 @@ def cf_mean_agg( return x -def cf_mean( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None -): - """TODODASK.""" - dtype = float - return reduction( - a, - cf_mean_chunk, - partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_mean_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) - - # -------------------------------------------------------------------- # mean_absolute_value # -------------------------------------------------------------------- def cf_mean_abs_chunk( x, weights=None, dtype=None, computing_meta=False, **kwargs ): - """Return chunk-based values for calculating the global absolute - mean. - - :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + """Chunk calculations for the mean of the absolute values. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - dtype: data_type, optional - Data type of global reduction. + .. versionadded:: TODODASK - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - - * V1: The sum of the weights (set to N if weights are not - present). - + * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). * sum: The weighted sum of ``abs(x)``. 
""" @@ -269,58 +1027,26 @@ def cf_mean_abs_chunk( return cf_mean_chunk(np.abs(x), weights, dtype=dtype, **kwargs) -def cf_mean_abs( - a, weights=None, axis=None, keepdims=False, mtol=None, split_every=None -): - """TODODASK.""" - dtype = a.dtype - return reduction( - a, - cf_mean_abs_chunk, - partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_mean_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) - - # -------------------------------------------------------------------- # maximum # -------------------------------------------------------------------- def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global maximum. - - :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + """Chunk calculations for the maximum. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - dtype: data_type, optional - Data type of global reduction. Ignored. + .. versionadded:: TODODASK - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * max: The maximum of `x``. """ @@ -339,7 +1065,19 @@ def cf_max_combine( computing_meta=False, **kwargs, ): - """Find the max and min of a nested list of arrays.""" + """Combine calculations for the maximum. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details. + + :Returns: + + As for `cf_max_chunk`. + + """ if not isinstance(pairs, list): pairs = [pairs] @@ -363,7 +1101,31 @@ def cf_max_agg( original_shape=None, **kwargs, ): - """Find the range of a nested list of arrays.""" + """Aggregate calculations for the maximum. + + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for further details. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ d = cf_max_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d @@ -373,56 +1135,26 @@ def cf_max_agg( return x -def cf_max(a, axis=None, keepdims=False, mtol=None, split_every=None): - """TODODASK.""" - dtype = a.dtype - return reduction( - a, - cf_max_chunk, - partial(cf_max_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_max_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) - - # -------------------------------------------------------------------- # maximum_absolute_value # -------------------------------------------------------------------- def cf_max_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global absolute - max. - - :Parameters: + """Chunk calculations for the maximum of absolute values. 
- x: numpy.ndarray - Chunks data being reduced along one or more axes. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + .. versionadded:: TODODASK - dtype: data_type, optional - Data type of global reduction. - - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * max: The maximum of ``abs(x)``. """ @@ -432,55 +1164,73 @@ def cf_max_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): return cf_max_chunk(np.abs(x), **kwargs) -def cf_max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): - """TODODASK.""" - dtype = a.dtype - return reduction( - a, - cf_max_abs_chunk, - partial(cf_max_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_max_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) +# -------------------------------------------------------------------- +# mid-range +# -------------------------------------------------------------------- +def cf_mid_range_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregate calculations for the mid-range. + + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for further details. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the mid-range + x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x # -------------------------------------------------------------------- # minimum # -------------------------------------------------------------------- def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global minimum. - - :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + """Chunk calculations for the minimum. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - dtype: data_type, optional - Data type of global reduction. + .. versionadded:: TODODASK - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * min: The minimum of ``x``. 
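            A small sketch of what one chunk contributes (missing
            values are excluded from both keys):

            >>> import numpy as np
            >>> x = np.ma.masked_invalid([2.0, np.nan, 5.0])
            >>> int(np.ma.count(x)), float(x.min())
            (2, 2.0)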
""" @@ -499,7 +1249,19 @@ def cf_min_combine( computing_meta=False, **kwargs, ): - """Find the max and min of a nested list of arrays.""" + """Combine calculations for the minimum. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details. + + :Returns: + + As for `cf_min_chunk`. + + """ if not isinstance(pairs, list): pairs = [pairs] @@ -523,7 +1285,31 @@ def cf_min_agg( original_shape=None, **kwargs, ): - """Find the range of a nested list of arrays.""" + """Aggregate calculations for the minimum. + + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for further details. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ d = cf_min_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d @@ -533,56 +1319,26 @@ def cf_min_agg( return x -def cf_min(a, axis=None, keepdims=False, mtol=None, split_every=None): - """TODODASK.""" - dtype = a.dtype - return reduction( - a, - cf_min_chunk, - partial(cf_min_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_min_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) - - # -------------------------------------------------------------------- # minimum absolute value # -------------------------------------------------------------------- def cf_min_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global absolute - min. - - :Parameters: + """Chunk calculations for the minimum of absolute values. - x: numpy.ndarray - Chunks data being reduced along one or more axes. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + .. versionadded:: TODODASK - dtype: data_type, optional - Data type of global reduction. - - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * min: The minimum of ``abs(x)``. """ @@ -592,57 +1348,27 @@ def cf_min_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): return cf_min_chunk(np.abs(x), **kwargs) -def cf_min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): - """TODODASK.""" - dtype = a.dtype - return reduction( - a, - cf_min_abs_chunk, - partial(cf_min_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_min_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) - - # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global range. + """Chunk calculations for the range. 
- :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + .. versionadded:: TODODASK - dtype: data_type, optional - Data type of global reduction. - - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * min: The minimum of ``x``. - * max: The maximum of ``x`. """ @@ -661,7 +1387,19 @@ def cf_range_combine( computing_meta=False, **kwargs, ): - """Find the max and min of a nested list of arrays.""" + """Combine calculations for the range. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details. + + :Returns: + + As for `cf_range_chunk`. + + """ if not isinstance(pairs, list): pairs = [pairs] @@ -688,113 +1426,61 @@ def cf_range_agg( original_shape=None, **kwargs, ): - """Find the range of a nested list of arrays.""" - d = cf_range_combine(pairs, axis, computing_meta, **kwargs) - if computing_meta: - return d + """Aggregate calculations for the range. - # Calculate the range - x = d["max"] - d["min"] - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + .. versionadded:: TODODASK -def cf_range(a, axis=None, keepdims=False, mtol=None, split_every=None): - """TODODASK.""" - dtype = a.dtype - return reduction( - a, - cf_range_chunk, - partial(cf_range_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_range_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) + :Parameters: + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. -# -------------------------------------------------------------------- -# mid-range -# -------------------------------------------------------------------- -cf_mid_range_chunk = cf_range_chunk -cf_mid_range_combine = cf_range_combine + original_shape: `tuple` + The shape of the original, uncollapsed data. + See `dask.array.reductions` for further details. -def cf_mid_range_agg( - pairs, - axis=None, - dtype="f8", - computing_meta=False, - mtol=None, - original_shape=None, - **kwargs, -): - """Find the mid-range of a nested list of arrays.""" - d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + :Returns: + + `dask.array.Array` + The collapsed array. 
+ + """ + d = cf_range_combine(pairs, axis, computing_meta, **kwargs) if computing_meta: return d - # Calculate the mid-range - x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + # Calculate the range + x = d["max"] - d["min"] x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_mid_range( - a, axis=None, dtype=float, keepdims=False, mtol=None, split_every=None -): - """TODODASK.""" - dtype = float - return reduction( - a, - cf_mid_range_chunk, - partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_mid_range_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) - - # -------------------------------------------------------------------- # root mean square # -------------------------------------------------------------------- def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global RMS. - - :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + """Chunk calculations for the root mean square (RMS).. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - dtype: data_type, optional - Data type of global reduction. + .. versionadded:: TODODASK - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * sum: The weighted sum of ``x**2``. """ @@ -815,67 +1501,59 @@ def cf_rms_agg( original_shape=None, **kwargs, ): - """Apply the function to the data in a nested list of arrays and - mask where the sample size is below the threshold.""" - d = cf_sum_combine(pairs, axis, computing_meta, **kwargs) + """Aggregate calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for further details. + + :Returns: + + `dask.array.Array` + The collapsed array. 
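        A worked sketch of the aggregation arithmetic with unit
        weights (so that ``V1 == N``):

        >>> import numpy as np
        >>> x = np.array([1.0, 2.0, 3.0, 4.0])
        >>> float(np.sqrt(np.sum(x * x) / x.size))
        2.7386127875258306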
+ + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) if computing_meta: return d - x = np.sqrt(d["sum"], dtype=dtype) + x = np.sqrt(d["sum"] / d["V1"], dtype=dtype) x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) return x -def cf_rms( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None -): - """TODODASK.""" - dtype = float - return reduction( - a, - cf_rms_chunk, - partial(cf_rms_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_sum_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) - - # -------------------------------------------------------------------- -# sample_size +# sample size # -------------------------------------------------------------------- def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global sample size. - - :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + """Chunk calculations for the sample size. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - dtype: data_type, optional - Data type of global reduction. + .. versionadded:: TODODASK - computing_meta: `bool` optional - See `dask.array.reductions` for details. + :Parameters: - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. """ @@ -896,13 +1574,27 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): def cf_sample_size_combine( pairs, axis=None, + dtype="i8", computing_meta=False, **kwargs, ): + """Combine calculations for the sample size. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details. + + :Returns: + + As for `cf_sample_size_chunk`. + + """ if not isinstance(pairs, list): pairs = [pairs] - x = sum_arrays(pairs, "N", axis, None, computing_meta, **kwargs) + x = sum_arrays(pairs, "N", axis, dtype, computing_meta, **kwargs) if computing_meta: return x @@ -913,11 +1605,37 @@ def cf_sample_size_agg( pairs, axis=None, computing_meta=False, + dtype="i8", mtol=None, original_shape=None, **kwargs, ): - d = cf_sample_size_combine(pairs, axis, computing_meta, **kwargs) + """Aggregate calculations for the sample size. + + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for further details. + + :Returns: + + `dask.array.Array` + The collapsed array. 
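        An illustrative sketch of the quantity being aggregated, the
        count of non-missing elements:

        >>> import numpy as np
        >>> x = np.ma.masked_invalid([1.0, np.nan, 3.0])
        >>> int(np.ma.count(x))
        2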
+ + """ + d = cf_sample_size_combine(pairs, axis, dtype, computing_meta, **kwargs) if computing_meta: return d @@ -926,61 +1644,41 @@ def cf_sample_size_agg( return x -def cf_sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): - """TODODASK.""" - dtype = int - return reduction( - a, - cf_sample_size_chunk, - partial(cf_sample_size_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_sample_size_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - ) - - # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- -def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Return chunk-based values for calculating the global sum. - - :Parameters: +def cf_sum_chunk( + x, weights=None, dtype="f8", computing_meta=False, squared=False, **kwargs +): + """Chunk calculations for the sum. - x: numpy.ndarray - Chunks data being reduced along one or more axes. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. + .. versionadded:: TODODASK - dtype: data_type, optional - Data type of global reduction. + :Parameters: - computing_meta: `bool` optional - See `dask.array.reductions` for details. + squared: `bool`, optional + If True then calculate the weighted sum of the squares. - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - - * sum: The weighted sum of ``x``. + * sum: The weighted sum of ``x``, or the weighted sum of + ``x**2`` if *squared* is True. """ if computing_meta: return x + if squared: + x = np.multiply(x, x, dtype=dtype) + if weights is not None: x = np.multiply(x, weights, dtype=dtype) @@ -996,7 +1694,19 @@ def cf_sum_combine( computing_meta=False, **kwargs, ): - """Apply the function to the data in a nested list of arrays.""" + """Combine calculations for the sum. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details. + + :Returns: + + As for `cf_sum_chunk`. + + """ if not isinstance(pairs, list): pairs = [pairs] @@ -1021,101 +1731,38 @@ def cf_sum_agg( original_shape=None, **kwargs, ): - """Apply the function to the data in a nested list of arrays and - mask where the sample size is below the threshold.""" - d = cf_sum_combine(pairs, axis, computing_meta, **kwargs) - if computing_meta: - return d - - x = d["sum"] - x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) - return x - - -def cf_sum( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None -): - """TODODASK.""" - dtype = float - return reduction( - a, - cf_sum_chunk, - partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_sum_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) + """Aggregate calculations for the sum. + This function is passed to `dask.array.reduction` as callable + *aggregate* parameter. 
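        A worked sketch of the chunk-level sums feeding this
        aggregation, including the ``squared=True`` variant used for
        the sum of squares:

        >>> import numpy as np
        >>> x = np.array([1.0, 2.0, 3.0])
        >>> w = np.array([0.5, 1.0, 2.0])
        >>> float(np.sum(x * w)), float(np.sum(x * x * w))
        (8.5, 22.5)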
-# -------------------------------------------------------------------- -# sum of sqaures -# -------------------------------------------------------------------- -def cf_sum_of_squares_chunk( - x, weights=None, dtype="f8", computing_meta=False, **kwargs -): - """Return chunk-based values for calculating the global sum. + .. versionadded:: TODODASK :Parameters: - x: numpy.ndarray - Chunks data being reduced along one or more axes. + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. - - dtype: data_type, optional - Data type of global reduction. - - computing_meta: `bool` optional - See `dask.array.reductions` for details. + original_shape: `tuple` + The shape of the original, uncollapsed data. - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for further details. :Returns: - `dict` - Dictionary with the keys: - - * N: The sample size. - - * sum: The weighted sum of ``x**2`` + `dask.array.Array` + The collapsed array. """ + d = cf_sum_combine(pairs, axis, dtype, computing_meta, **kwargs) if computing_meta: - return x - - return cf_sum_chunk( - np.multiply(x, x, dtype=dtype), weights, dtype=dtype, **kwargs - ) - + return d -def cf_sum_of_squares( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None -): - """TODODASK.""" - dtype = float - return reduction( - a, - cf_sum_of_squares_chunk, - partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_sum_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) + x = d["sum"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x # -------------------------------------------------------------------- @@ -1124,33 +1771,24 @@ def cf_sum_of_squares( def cf_sum_of_weights_chunk( x, weights=None, dtype="f8", computing_meta=False, squared=False, **kwargs ): - """Return chunk-based values for calculating the global sum. + """Chunk calculations for the sum of the weights. - :Parameters: - - x: numpy.ndarray - Chunks data being reduced along one or more axes. + This function is passed to `dask.array.reduction` as callable + *chunk* parameter. - weights: numpy array-like, optional - Weights to be used in the reduction of *x*, with the same - shape as *x*. By default the reduction is unweighted. - - dtype: data_type, optional - Data type of global reduction. + :Parameters: - computing_meta: `bool` optional - See `dask.array.reductions` for details. + squared: `bool`, optional + If True then calculate the sum of the squares of the + weights. - kwargs: `dict`, optional - See `dask.array.reductions` for details. + See `dask.array.reductions` for details. :Returns: `dict` Dictionary with the keys: - * N: The sample size. - * sum: The sum of ``weights``, or the sum of ``weights**2`` if *squared* is True. 
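            For example (each non-missing element contributes its
            weight rather than one; an illustrative sketch):

            >>> import numpy as np
            >>> w = np.array([0.5, 1.0, 2.0])
            >>> float(np.sum(w)), float(np.sum(w * w))
            (3.5, 5.25)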
@@ -1162,91 +1800,37 @@ def cf_sum_of_weights_chunk(
     d = cf_sample_size_chunk(x, **kwargs)
 
     # sum
-    d["sum"] = sum_of_weights(
-        x, weights=weights, dtype=dtype, N=d["N"], squared=squared, **kwargs
-    )
+    d["sum"] = sum_weights(
+        x, weights=weights, squared=squared, N=d["N"], **kwargs
+    )
 
     return d
 
 
-def cf_sum_of_weights(
-    a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None
-):
-    """TODODASK."""
-    dtype = float
-    return reduction(
-        a,
-        cf_sum_of_weights_chunk,
-        partial(cf_sum_agg, mtol=mtol, original_shape=a.shape),
-        axis=axis,
-        keepdims=keepdims,
-        dtype=dtype,
-        split_every=split_every,
-        combine=cf_sum_combine,
-        out=None,
-        concatenate=False,
-        meta=np.array((), dtype=dtype),
-        weights=weights,
-    )
-
-
-def cf_sum_of_weights2(
-    a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None
-):
-    """TODODASK."""
-    dtype = float
-    return reduction(
-        a,
-        partial(cf_sum_of_weights_chunk, squared=True),
-        partial(cf_sum_agg, mtol=mtol, original_shape=a.shape),
-        axis=axis,
-        keepdims=keepdims,
-        dtype=dtype,
-        split_every=split_every,
-        combine=cf_sum_combine,
-        out=None,
-        concatenate=False,
-        meta=np.array((), dtype=dtype),
-        weights=weights,
-    )
-
-
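The ``sum_weights`` helper that replaces ``sum_of_weights`` is defined elsewhere in this patch; only its call sites appear in these hunks. For orientation, here is a plausible reading of what such a helper computes, inferred from the calls above. Treat the body as an assumption, not the actual cf implementation.

    import numpy as np

    def sum_weights(x, weights=None, squared=False, N=None, dtype="f8",
                    **kwargs):
        # Hypothetical sketch: the sum of the weights (or of their
        # squares) over the reduction axes, honouring the mask of x
        if weights is None:
            # Unweighted: every non-missing element has unit weight, so
            # the sum of the (squared) weights is just the sample size
            if N is None:
                N = np.sum(~np.ma.getmaskarray(x), **kwargs)
            return np.asanyarray(N).astype(dtype)

        w = np.multiply(weights, weights, dtype=dtype) if squared else weights
        w = np.broadcast_to(w, x.shape)
        if np.ma.is_masked(x):
            # Weights of missing elements must not contribute
            w = np.ma.masked_where(np.ma.getmaskarray(x), w)
        return np.sum(w, dtype=dtype, **kwargs)

This matches how ``cf_sum_of_weights_chunk`` passes the precomputed sample size ``N`` and how ``cf_var_chunk`` requests ``squared=True`` for ``V2``.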
 # --------------------------------------------------------------------
 # variance
 # --------------------------------------------------------------------
 def cf_var_chunk(
     x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs
 ):
-    """Return chunk-based values for calculating the global variance.
-
-    .. note:: If weights are provided then they are interpreted as
-              reliability weights, as opposed to frequency weights
-              (where a weight equals the number of occurrences).
+    """Chunk calculations for the variance.
 
-    :Parameters:
+    This function is passed to `dask.array.reduction` as callable
+    *chunk* parameter.
 
-    x: numpy.ndarray
-        Chunks data being reduced along one or more axes.
+    See
+    https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics
+    for details.
 
-    weights: numpy array-like, optional
-        Weights to be used in the reduction of *x*, with the same
-        shape as *x*. By default the reduction is unweighted.
+    .. versionadded:: TODODASK
 
-    dtype: data_type, optional
-        Data type of global reduction.
+    :Parameters:
 
-    ddof: number, optional
-        The delta degrees of freedom. The number of degrees of
-        freedom used in the calculation is (N-*ddof*) where N
-        represents the number of non-missing elements. By default
-        *ddof* is 0, for the biased variance. Setting ddof to
-        ``1`` applies Bessel's correction
-        (https://en.wikipedia.org/wiki/Bessel's_correction)
-
-    computing_meta: `bool` optional
-        See `dask.array.reductions` for details.
+        ddof: number
+            The delta degrees of freedom. The number of degrees of
+            freedom used in the calculation is (N-*ddof*) where N
+            represents the number of non-missing elements. A value of
+            1 applies Bessel's correction.
 
-    kwargs: `dict`, optional
-        See `dask.array.reductions` for details.
+        See `dask.array.reductions` for further details.
 
     :Returns:
 
@@ -1254,19 +1838,12 @@ def cf_var_chunk(
         Dictionary with the keys:
 
         * N: The sample size.
-
-        * V1: The sum of ``weights`` (set to ``N`` if weights are
-          not set).
-
+        * V1: The sum of ``weights`` (equal to ``N`` if weights
+          are not set).
         * sum: The weighted sum of ``x``.
-
         * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is
           the weighted biased (i.e. ``ddof=0``) variance of
-          ``x``, and ``mu`` is the weighted mean of
-          ``x``. See
-          https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics
-          for details.
-
+          ``x``, and ``mu`` is the weighted mean of ``x``.
+        * V2: The sum of ``weights**2``. Only present if
+          *weights* are set and ``ddof=1``.
@@ -1292,7 +1869,7 @@ def cf_var_chunk(
     d["part"] = part
 
     if weights is not None and ddof == 1:
-        d["V2"] = sum_of_weights(x, weights, squared=True, **kwargs)
+        d["V2"] = sum_weights(x, weights, squared=True, **kwargs)
 
     return d
 
@@ -1304,8 +1881,21 @@ def cf_var_combine(
     computing_meta=False,
     **kwargs,
 ):
-    """TODO."""
-    d = {}
+    """Combine calculations for the variance.
+
+    .. versionadded:: TODODASK
+
+    :Parameters:
+
+        See `dask.array.reductions` for details.
+
+    :Returns:
+
+        As for `cf_var_chunk`.
+
+    """
+    if not isinstance(pairs, list):
+        pairs = [pairs]
 
     weighted = "V2" in flatten(pairs)
 
@@ -1313,6 +1903,7 @@ def cf_var_combine(
     if weighted:
         keys += ("V1", "V2")
 
+    d = {}
     for key in keys:
         d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs)
         if computing_meta:
@@ -1321,7 +1912,7 @@ def cf_var_combine(
     d["N"] = sum_sample_sizes(pairs, axis, **kwargs)
 
     if not weighted:
-        d["V1"] = d["N"]
+        d["V1"] = d["N"].astype("f8")
 
     return d
 
@@ -1332,73 +1923,82 @@ def cf_var_agg(
     dtype="f8",
     computing_meta=False,
     mtol=None,
-    original_shape=None,
     ddof=None,
+    original_shape=None,
     **kwargs,
 ):
-    """TODO."""
-    d = cf_var_combine(pairs, axis, computing_meta, **kwargs)
+    """Aggregate calculations for the variance.
+
+    This function is passed to `dask.array.reduction` as callable
+    *aggregate* parameter.
+
+    .. note:: If weights are provided then they are interpreted as
+              reliability weights, as opposed to frequency weights.
+
+              See
+              https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights
+              for details.
+
+    .. versionadded:: TODODASK
+
+    :Parameters:
+
+        mtol: number, optional
+            The sample size threshold below which collapsed values are
+            set to missing data. See `mask_small_sample_size` for
+            details.
+
+        ddof: number
+            The delta degrees of freedom. The number of degrees of
+            freedom used in the calculation is (N-*ddof*) where N
+            represents the number of non-missing elements. A value of
+            1 applies Bessel's correction.
+
+        original_shape: `tuple`
+            The shape of the original, uncollapsed data.
+
+        See `dask.array.reductions` for further details.
+
+    :Returns:
+
+        `dask.array.Array`
+            The collapsed array.
+
+    """
+    d = cf_var_combine(pairs, axis, dtype, computing_meta, **kwargs)
    if computing_meta:
        return d
 
     V1 = d["V1"]
-    V2 = d.get("V2")
-    weighted = V2 is not None
-
     wsum = d["sum"]
     var = d["part"] - wsum * wsum / V1
 
-    # Note: var is currently the global value of V1 * sigma**2, where
-    # sigma is the global weighted biased (i.e. ddof=0) variance.
+    # Note: var is now the global value of V1 * sigma**2, where sigma
+    # is the global weighted biased (i.e. ddof=0) variance.
 
-    if ddof == 0:  # intended equality with zero
+    V2 = d.get("V2")
+    weighted = V2 is not None
+
+    if ddof is None:
+        raise ValueError(f"Must set ddof to a numeric value. Got: {ddof!r}")
+
+    if not ddof:
+        # Weighted or unweighted variance with ddof=0
         f = 1 / V1
     elif not weighted:
-        # Unweighted variance with any non-zero value of ddof.
+        # Unweighted variance with any non-zero value of ddof
         f = 1 / (V1 - ddof)
     elif ddof == 1:
-        # Weighted variance with ddof=1. For details see
-        # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights
+        # Weighted variance with ddof=1
         f = V1 / (V1 * V1 - V2)
     else:
         raise ValueError(
-            "Can only calculate a weighted variance with ddof=0 or ddof=1: "
-            f"Got {ddof!r}"
+            "Can only calculate a weighted variance with ddof=0 or ddof=1. "
+            f"Got: {ddof!r}"
         )
 
-    # Calculate the global variance, with the specified weighting and
-    # ddof.
+    # Now get the required global variance
     var = f * var
-    # Note: var is now the global value of sigma**2
-
     var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape)
 
     return var
-
-
-def cf_var(
-    a,
-    axis=None,
-    weights=None,
-    keepdims=False,
-    mtol=None,
-    ddof=None,
-    split_every=None,
-):
-    """TODODASK."""
-    dtype = float
-    return reduction(
-        a,
-        partial(cf_var_chunk, ddof=ddof),
-        partial(cf_var_agg, mtol=mtol, ddof=ddof, original_shape=a.shape),
-        axis=axis,
-        keepdims=keepdims,
-        dtype=dtype,
-        split_every=split_every,
-        combine=cf_var_combine,
-        out=None,
-        concatenate=False,
-        meta=np.array((), dtype=dtype),
-        weights=weights,
-    )
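The algebra used by ``cf_var_chunk`` and ``cf_var_agg`` can be checked numerically. In this plain-numpy sketch (invented names, two hand-made "chunks"), each chunk contributes ``V1``, the weighted sum, ``part = V1 * (sigma**2 + mu**2)`` and ``V2``, and the global ``ddof=0`` and ``ddof=1`` variances are then recovered with exactly the correction factors applied above:

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.normal(size=100)
    w = rng.uniform(0.5, 2.0, size=100)

    # Per-"chunk" partial results, as in cf_var_chunk
    parts = []
    for s in (slice(0, 60), slice(60, None)):
        xi, wi = x[s], w[s]
        V1 = wi.sum()
        mu = np.average(xi, weights=wi)
        sigma2 = np.average((xi - mu) ** 2, weights=wi)  # biased, ddof=0
        parts.append(
            (V1, (wi * xi).sum(), V1 * (sigma2 + mu**2), (wi**2).sum())
        )

    # Aggregation, as in cf_var_combine/cf_var_agg
    V1, wsum, part, V2 = (sum(p[i] for p in parts) for i in range(4))
    var = part - wsum * wsum / V1  # V1 times the global biased variance

    var0 = var / V1                    # ddof=0: f = 1 / V1
    var1 = var * V1 / (V1 * V1 - V2)   # ddof=1, reliability weights

    mu = np.average(x, weights=w)
    assert np.isclose(var0, np.average((x - mu) ** 2, weights=w))
    print(var0, var1)

Because ``part`` is just the weighted sum of ``x**2`` within each chunk, summing it across chunks and subtracting ``wsum**2 / V1`` yields ``V1`` times the global biased variance, which the ``f`` factors then rescale.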
" + f"Got: {ddof!r}" ) - # Calculate the global variance, with the specified weighting and - # ddof. + # Now get the required global variance var = f * var - # Note: var is now the global value of sigma**2 - var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) return var - - -def cf_var( - a, - axis=None, - weights=None, - keepdims=False, - mtol=None, - ddof=None, - split_every=None, -): - """TODODASK.""" - dtype = float - return reduction( - a, - partial(cf_var_chunk, ddof=ddof), - partial(cf_var_agg, mtol=mtol, ddof=ddof, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_var_combine, - out=None, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) diff --git a/cf/data/data.py b/cf/data/data.py index 9c85050a53..c3b1bcf5eb 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -48,6 +48,7 @@ NetCDFArray, UMArray, ) +from .collapse_functions import Collapse from .creation import ( compressed_to_dask, convert_to_builtin_type, @@ -68,10 +69,9 @@ from .mixin import DataClassDeprecationsMixin from .partition import Partition from .partitionmatrix import PartitionMatrix -from .utils import ( # is_small,; is_very_small, +from .utils import ( # is_small,; is_very_small,; collapse, YMDhms, _is_numeric_dtype, - collapse, conform_units, convert_to_datetime, convert_to_reftime, @@ -1951,7 +1951,17 @@ def median( inplace=False, _preserve_partitions=False, ): - """Compute the median of the values.""" + """Compute the median of the values. + + :Parameters: + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + """ return self.percentile( 50, axes=axes, @@ -1976,6 +1986,14 @@ def mean_of_upper_decile( Specifically, calculate the mean of the upper group of data values defined by the upper tenth of their distribution. + :Parameters: + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + """ d = _inplace_enabled_define_and_cleanup(self) @@ -2120,23 +2138,11 @@ def percentile( is guaranteed to broadcast correctly against the original data. - mtol: number, optional - Set an upper limit of the amount input data values - which are allowed to be missing data when contributing - to individual output percentile values. It is defined - as a fraction (between 0 and 1 inclusive) of the - contributing input data values. The default is 1, - meaning that a missing datum in the output array only - occurs when all of its contributing input array - elements are missing data. A value of 0 means that a - missing datum in the output array occurs whenever any - of its contributing input array elements are missing - data. + {{mtol: number, optional} - *Parameter example:* - To ensure that an output array element is a missing - value if more than 25% of its input array elements - are missing data: ``mtol=0.25``. + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -6932,11 +6938,22 @@ def max( inplace=False, i=False, ): - from .collapse_functions import cf_max + """TODO. + + :Parameters: + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. 
versionadded:: TODODASK + + """ + # from .collapse_functions import cf_max d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_maxn, + d, _ = _collapse( + Collapse.max, d, axis=axes, keepdims=not squeeze, @@ -6971,6 +6988,12 @@ def maximum( squeeze : bool, optional + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} :Returns: @@ -7012,6 +7035,12 @@ def maximum_absolute_value( squeeze : bool, optional + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} :Returns: @@ -7033,11 +7062,11 @@ def maximum_absolute_value( """ - from .collapse_functions import cf_max_abs + # from .collapse_functions import cf_max_abs d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_max_abs, + d, _ = _collapse( + Collapse.max_abs, d, axis=axes, keepdims=not squeeze, @@ -7059,11 +7088,22 @@ def min( i=False, _preserve_partitions=False, ): - from .collapse_functions import cf_min + """TODO. + + :Parameters: + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + """ + # from .collapse_functions import cf_min d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_min, + d, _ = _collapse( + Collapse.min, d, axis=axes, keepdims=not squeeze, @@ -7078,6 +7118,7 @@ def minimum( axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, _preserve_partitions=False, @@ -7123,6 +7164,7 @@ def minimum_absolute_value( axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, ): """Collapse axes with their minimum absolute value. @@ -7134,6 +7176,18 @@ def minimum_absolute_value( :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + {{inplace: `bool`, optional}} + :Returns: `Data` or `None` @@ -7153,11 +7207,11 @@ def minimum_absolute_value( """ - from .collapse_functions import cf_min_abs + # from .collapse_functions import cf_min_abs d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_min_abs, + d, _ = _collapse( + Collapse.min_abs, d, axis=axes, keepdims=not squeeze, @@ -7187,12 +7241,26 @@ def mean( Missing data array elements and their corresponding weights are omitted from the calculation. - """ - from .collapse_functions import cf_mean + :Parameters: + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} + + """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_mean, + d, _ = _collapse( + Collapse.mean, d, axis=axes, weights=weights, @@ -7223,6 +7291,18 @@ def mean_absolute_value( :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. 
versionadded:: TODODASK + {{inplace: `bool`, optional}} :Returns: @@ -7240,11 +7320,11 @@ def mean_absolute_value( """ - from .collapse_functions import cf_mean_abs + # from .collapse_functions import cf_mean_abs d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - ccf_mean_abs, + d, _ = _collapse( + Collapse.mean_abs, d, axis=axes, weights=weights, @@ -7274,43 +7354,35 @@ def integral( :Parameters: - {{collapse axes: (sequence of) int, optional}} + {{collapse axes: (sequence of) `int`, optional}} {{collapse squeeze: `bool`, optional}} - weights: data_like or dict, optional + {{weights: data_like, `dict`, or `None`, optional}} - Weights associated with values of the array. By - default all non-missing elements of the array are - assumed to have a weight equal to one. If *weights* is - a data_like object then it must have either the same - shape as the array or, if that is not the case, the - same shape as the axes being collapsed. + Note that the units of the weights matter for an + integral collapse, which differs from a weighted sum + in that the units of the weights are incorporated into + the result. + *Parameter example:* + If ``weights={1: w, (2, 0): x}`` then ``w`` must + contain 1-dimensional weights for axis 1 and ``x`` + must contain 2-dimensional weights for axes 2 and + 0. This is equivalent, for example, to + ``weights={(1, 2, 0), y}``, where ``y`` is the outer + product of ``w`` and ``x``. If ``axes=[1, 2, 0]`` + then ``weights={(1, 2, 0), y}`` is equivalent to + ``weights=y``. If ``axes=None`` and the array is + 3-dimensional then ``weights={(1, 2, 0), y}`` is + equivalent to ``weights=y.transpose([2, 0, 1])``. - If *weights* is a dictionary then each key specifies - axes of the array (an `int` or `tuple` of `int`), with - a corresponding value of data_like weights for those - axes. In this case, the implied weights array is the - outer product of the dictionary's values. + {{mtol: number, optional} - Note that the units of the weights matter for an integral - collapse, which differs from a weighted sum in that the units - of the weights are incorporated into the result. + {{split_every: `int` or `dict`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -7326,11 +7398,11 @@ def integral( **Examples:** """ - from .collapse_functions import cf_sum + # from .collapse_functions import cf_sum d = _inplace_enabled_define_and_cleanup(self) - d, weights = collapse( - cf_sum, + d, weights = _collapse( + Collapse.sum, d, axis=axes, weights=weights, @@ -7366,17 +7438,29 @@ def sample_size( inplace=False, i=False, ): - from .collapse_functions import cf_sample_size + """TODO. + + :Parameters: + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. 
versionadded:: TODODASK + + """ + # from .collapse_functions import cf_sample_size d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_sample_size, + d, _ = _collapse( + Collapse.sample_size, d, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) + return d @property def binary_mask(self): @@ -9920,6 +10004,12 @@ def mid_range( :Parameters: + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -9932,20 +10022,17 @@ def mid_range( **Examples:** """ - from .collapse_functions import cf_mid_range as collapse + # from .collapse_functions import cf_mid_range d = _inplace_enabled_define_and_cleanup(self) - - dx = d._get_dask() - dx = collapse( - dx, + d, _ = _collapse( + Collapse.mid_range, + d, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) - d._set_dask(dx, reset_mask_hardness=True) - return d @daskified(_DASKIFIED_VERBOSE) @@ -10129,6 +10216,71 @@ def isclose(self, y, rtol=None, atol=None): except (TypeError, NotImplementedError, IndexError): return self == y + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): + """Change the shape of the data without changing its values. + + :Parameters: + + shape: `tuple` of `int`, or any number of `int` + The new shape for the data, which should be compatible + with the original shape. If an integer, then the + result will be a 1-d array of that length. One shape + dimension can be -1, in which case the value is + inferred from the length of the array and remaining + dimensions. + + merge_chunks: `bool` + When True (the default) merge chunks using the logic + in `dask.array.rechunk` when communication is + necessary given the input array chunking and the + output shape. When False, the input array will be + rechunked to a chunksize of 1, which can create very + many tasks. See `dask.array.reshape` for details. + + limit: int, optional + The maximum block size to target in bytes. If no limit + is provided, it defaults to the configuration value + ``dask.config.get('array.chunk-size')``. See + `dask.array.reshape` for details. + + :Returns: + + `Data` or `None` + The reshaped data, or `None` if the operation was + in-place. + + **Examples** + + >>> d = cf.Data(np.arange(12)) + >>> print(d.array) + [ 0 1 2 3 4 5 6 7 8 9 10 11] + >>> print(d.reshape(3, 4).array) + [[ 0 1 2 3] + [ 4 5 6 7] + [ 8 9 10 11]] + >>> print(d.reshape((4, 3)).array) + [[ 0 1 2] + [ 3 4 5] + [ 6 7 8] + [ 9 10 11]] + >>> print(d.reshape(-1, 6).array) + [[ 0 1 2 3 4 5] + [ 6 7 8 9 10 11]] + >>> print(d.reshape(1, 1, 2, 6).array) + [[[[ 0 1 2 3 4 5] + [ 6 7 8 9 10 11]]]] + >>> print(d.reshape(1, 1, -1).array) + [[[[ 0 1 2 3 4 5 6 7 8 9 10 11]]]] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d._get_dask() + dx = dx.reshape(*shape, merge_chunks=merge_chunks, limit=limit) + d._set_dask(dx, reset_mask_hardness=True) + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) @@ -10165,14 +10317,16 @@ def rint(self, inplace=False, i=False): d._set_dask(da.rint(dx), reset_mask_hardness=False) return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def root_mean_square( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): """Collapse axes with their root mean square. 
@@ -10181,45 +10335,19 @@ def root_mean_square( :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. + {{weights: data_like, `dict`, or `None`, optional}} - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. + {{collapse squeeze: `bool`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{mtol: number, optional} - {{inplace: `bool`, optional}} + {{split_every: `int` or `dict`, optional}} - {{i: deprecated at version 3.0.0}} + .. versionadded:: TODODASK + + {{inplace: `bool`, optional}} :Returns: @@ -10232,21 +10360,18 @@ def root_mean_square( **Examples:** """ - from .collapse_functions import cf_rms as collapse + # from .collapse_functions import cf_rms d = _inplace_enabled_define_and_cleanup(self) - - dx = d._get_dask() - dx = collapse( - dx, + d, _ = _collapse( + Collapse.rms, + d, axis=axes, weights=weights, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) - d._set_dask(dx, reset_mask_hardness=True) - return d @daskified(_DASKIFIED_VERBOSE) @@ -11858,6 +11983,12 @@ def range( default. A default can also be set globally with the ``split_every`` key in :mod:`dask.config`. + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -11870,20 +12001,17 @@ def range( **Examples:** """ - from .collapse_functions import cf_range as collapse + # from .collapse_functions import cf_range d = _inplace_enabled_define_and_cleanup(self) - - dx = d._get_dask() - dx = collapse( - dx, + d, _ = _collapse( + Collapse.range, + d, axis=axes, keepdims=not squeeze, split_every=split_every, mtol=mtol, ) - d._set_dask(dx, reset_mask_hardness=True) - return d @daskified(_DASKIFIED_VERBOSE) @@ -11950,11 +12078,20 @@ def sum( split_every=None, i=False, ): - from .collapse_functions import cf_sum + """TODO. + + :Parameters: + + {{mtol: number, optional} + {{split_every: `int` or `dict`, optional}} + + .. 
versionadded:: TODODASK + + """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_sum, + d, _ = _collapse( + Collapse.sum, d, axis=axes, weights=weights, @@ -11974,7 +12111,6 @@ def sum_of_squares( weights=None, split_every=None, inplace=False, - _preserve_partitions=False, ): """Collapse axes with the sum of the squares of the values. @@ -11986,9 +12122,17 @@ def sum_of_squares( :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -12007,11 +12151,9 @@ def sum_of_squares( """ - from .collapse_functions import cf_sum_of_squares - d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - cf_sum_of_squares, + d, _ = _collapse( + Collapse.sum_of_squares, d, axis=axes, weights=weights, @@ -12029,17 +12171,15 @@ def sum_of_squares( @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) - # @_deprecated_kwarg_check("i") def sum_of_weights( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): """Collapse axes with the sum of weights. @@ -12050,9 +12190,13 @@ def sum_of_weights( :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional} {{split_every: `int` or `dict`, optional}} @@ -12067,14 +12211,14 @@ def sum_of_weights( `Data` or `None` The collapsed array. - **Examples:** + **Examples** """ - from .collapse_functions import cf_sum_of_weights + # from .collapse_functions import cf_sum_of_weights d = _inplace_enabled_define_and_cleanup(self) - d, weights = collapse( - cf_sum_of_weights, + d, weights = _collapse( + Collapse.sum_of_weights, d, axis=axes, weights=weights, @@ -12099,13 +12243,12 @@ def sum_of_weights( def sum_of_weights2( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): """Collapse axes with the sum of squares of weights. @@ -12116,9 +12259,17 @@ def sum_of_weights2( :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -12129,14 +12280,14 @@ def sum_of_weights2( `Data` or `None` The collapsed array. 
- **Examples:** + **Examples** """ - from .collapse_functions import cf_sum_of_weights2 + # from .collapse_functions import cf_sum_of_weights2 d = _inplace_enabled_define_and_cleanup(self) - d, weights = collapse( - cf_sum_of_weights2, + d, weights = _collapse( + Collapse.sum_of_weights2, d, axis=axes, weights=weights, @@ -12148,7 +12299,7 @@ def sum_of_weights2( units = _units_None if weights is not None: units = getattr(weights, "Units", None) - if units is None: + if not units: units = _units_None else: units = units ** 2 @@ -12164,7 +12315,7 @@ def sd( squeeze=False, mtol=1, weights=None, - ddof=0, # TODASK: Is this the right default? + ddof=None, split_every=None, inplace=False, i=False, @@ -12267,22 +12418,13 @@ def sd( 0): x.outerproduct(y)}`` (see `outerproduct` for details). - mtol : number, optional - For each element in the output data array, the fraction of - contributing input array elements which is allowed to - contain missing data. Where this fraction exceeds *mtol*, - missing data is returned. The default is 1, meaning a - missing datum in the output array only occurs when its - contributing input array elements are all missing data. A - value of 0 means that a missing datum in the output array - occurs whenever any of its contributing input array - elements are missing data. Any intermediate value is - permitted. - - ddof : number, optional - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of elements. By default *ddof* is 0 + {{mtol: number, optional} + + {{ddof: number}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -12331,24 +12473,38 @@ def var( weights=None, squeeze=False, mtol=1, - ddof=0, + ddof=None, inplace=False, split_every=None, i=False, _preserve_partitions=False, ): - from .collapse_functions import cf_var + """TODO. + + :Parameters: + + {{mtol: number, optional} + + {{ddof: number}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + """ + if ddof is None: + raise ValueError("Must set the delta degrees of freedom (ddof)") d = _inplace_enabled_define_and_cleanup(self) - d, _ = collapse( - partial(cf_var, ddof=ddof), + d, _ = _collapse( + partial(Collapse.var, ddof=ddof), d, axis=axes, weights=weights, keepdims=not squeeze, mtol=mtol, - split_every=split_every, ddof=ddof, + split_every=split_every, ) units = d.Units @@ -12428,7 +12584,7 @@ def standard_deviation( squeeze=False, mtol=1, weights=None, - ddof=0, + ddof=None, inplace=False, i=False, ): @@ -12449,7 +12605,7 @@ def variance( squeeze=False, weights=None, mtol=1, - ddof=0, + ddof=None, inplace=False, i=False, ): @@ -12688,3 +12844,209 @@ def _where_broadcastable(data, x, name): ) return True + + +def _collapse( + func, + d, + axis=None, + weights=None, + keepdims=True, + mtol=1, + ddof=None, + split_every=None, +): + """Collapse data using a callable *func*. + + .. versionadded:: TODODASK + + .. seealso:: `_format_weights` + + :Parameters: + + func: callable + The function that collapses the underlying `dask` array of + *d*. Must have the minimum signature (parameters and + default values) ``func(dx, axis=None, keepdims=False, + mtol=None, split_every=None)`` (optionally including + ``weights=None`` or ``ddof=None``), where ``dx`` is a + `dask.array.Array` + + d: `Data` + The data to be collapsed. + + axis: (sequence of) int, optional + The axes to be collapsed. 
By default all axes are
+            collapsed, resulting in output with size 1. Each axis is
+            identified by its integer position. If *axes* is an empty
+            sequence then the collapse is applied to each scalar
+            element and the result has the same shape as the input
+            data.
+
+        weights: data_like, `dict`, or `None`, optional
+            Weights associated with values of the array. By default
+            all non-missing elements of the array are assumed to have
+            a weight equal to one.
+
+            If *weights* is a data_like object then it must be
+            broadcastable to the array or, if that is not the case,
+            the same shape as the axes being collapsed. TODODASK -
+            scrub the last possibility?
+
+            If *weights* is a dictionary then each key specifies axes
+            of the array (an `int` or `tuple` of `int`), with a
+            corresponding value of data_like weights for those
+            axes. The dimensions of a weights value must correspond to
+            its key axes in the same order. The weights that will be
+            used in the collapse will be the outer product of the
+            dictionary's values.
+
+            For collapses that do not need weights (such as a
+            maximum), *weights* must be `None` and *func* need not
+            support a ``weights`` parameter.
+
+        keepdims: `bool`, optional
+            By default, the axes which are collapsed are left in the
+            result as dimensions with size one, so that the result
+            will broadcast correctly against the input array. If set
+            to False then collapsed axes are removed from the data.
+
+        mtol: number, optional
+            Set the sample size threshold below which collapsed values
+            are set to missing data. It is defined as a fraction
+            (between 0 and 1 inclusive) of the contributing input data
+            values. A missing datum in the output array occurs
+            whenever more than ``100*mtol%`` of its contributing input
+            array elements are missing data.
+
+        ddof: number, optional
+            The delta degrees of freedom. The number of degrees of
+            freedom used in the calculation is (N-*ddof*) where N
+            represents the number of non-missing elements.
+
+            For collapses that do not need degrees of freedom (such as
+            a mean), *ddof* must be `None` and *func* need not support
+            a ``ddof`` parameter.
+
+        split_every: `int` or `dict`, optional
+            Determines the depth of the recursive aggregation. See
+            `dask.array.reduction` for details.
+
+    :Returns:
+
+        `Data`, formatted weights
+            The collapsed data and the output of ``_format_weights(d,
+            weights, axis)``.
+
+    """
+    kwargs = {
+        "axis": axis,
+        "keepdims": keepdims,
+        "split_every": split_every,
+        "mtol": mtol,
+    }
+
+    weights = _format_weights(d, weights, axis)
+    if weights is not None:
+        kwargs["weights"] = weights
+
+    if ddof is not None:
+        kwargs["ddof"] = ddof
+
+    dx = d.to_dask_array()
+    dx = func(dx, **kwargs)
+    d._set_dask(dx, reset_mask_hardness=True)
+
+    return d, weights
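The dictionary form of *weights* described above collapses to a single array by broadcasting: each component is reshaped so that its key axes keep their sizes and every other axis becomes size 1, and the reshaped components are then multiplied. A small plain-numpy illustration of that construction (invented variable names; the cf version is `_format_weights` below):

    import numpy as np

    data = np.arange(24.0).reshape(2, 3, 4)
    weights = {0: np.array([1.0, 2.0]), 2: np.array([1.0, 2.0, 3.0, 4.0])}

    # Reshape each component so its key axis keeps its size and all
    # other axes become size 1
    w = []
    for key, value in weights.items():
        new_shape = [n if i == key else 1 for i, n in enumerate(data.shape)]
        w.append(value.reshape(new_shape))

    total = w[0] * w[1]  # shape (2, 1, 4): broadcastable outer product
    expected = np.multiply.outer(weights[0], weights[2]).reshape(2, 1, 4)
    assert (total == expected).all()

    # A weighted collapse over axes 0 and 2 then broadcasts directly
    weighted_sum = (data * total).sum(axis=(0, 2))
    print(weighted_sum.shape)  # (3,)

Keys with multiple axes work the same way: the component is reshaped so that each of its dimensions lands on its key axis, so the product of all components broadcasts against the data.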
+
+
+def _format_weights(d, weights, axis=None):
+    """TODODASK.
+
+    :Returns:
+
+        `Data` or `None`
+
+    **Examples**
+
+    >>> d = cf.Data(np.arange(12)).reshape(4, 3)
+
+    >>> print(cf.data.data._format_weights(d, None))
+    None
+
+    >>> cf.data.data._format_weights(d, [1, 2, 1], (0, 1))
+
+    >>> cf.data.data._format_weights(d, [[1, 2, 1]], (0, 1))
+
+    >>> cf.data.data._format_weights(d, {1: [1, 2, 1]}, (0, 1))
+
+    >>> cf.data.data._format_weights(
+    ...     d, {0: [1, 2, 3, 4], 1: [1, 2, 1]}, (0, 1)
+    ... )
+    TODODASK (need __mul__ to be daskified)
+
+    """
+    if weights is None:
+        # No weights
+        return
+
+    if not isinstance(weights, dict):
+        # Weights is data_like. Don't check broadcastability to d,
+        # leave that to whatever uses the weights.
+        return Data.asdata(weights)
+
+    if not weights:
+        # No weights (empty dictionary)
+        return
+
+    if axis is None:
+        axis = tuple(range(d.ndim))
+    else:
+        axis = d._parse_axes(axis)
+
+    weights = weights.copy()
+    weights_axes = set()
+    for key, value in tuple(weights.items()):
+        del weights[key]
+        key = d._parse_axes(key)
+        if weights_axes.intersection(key):
+            raise ValueError("Duplicate weights axis")
+
+        weights[tuple(key)] = value
+        weights_axes.update(key)
+
+    if not weights_axes.intersection(axis):
+        # No weights span collapse axes
+        return
+
+    # For each component, add missing dimensions as size 1.
+    w = []
+    shape = d.shape
+    for key, value in weights.items():
+        value = Data.asdata(value)
+
+        # Make sure axes are in ascending order
+        skey = tuple(sorted(key))
+        if key != skey:
+            value = value.transpose([key.index(i) for i in skey])
+            key = skey
+
+        if not all(
+            i in (j, 1)
+            for i, j in zip(value.shape, [shape[i] for i in key])
+        ):
+            raise ValueError(
+                f"Weights component for axes {tuple(key)} with shape "
+                f"{value.shape} is not broadcastable to data with "
+                f"shape {shape}"
+            )
+
+        new_shape = [n if i in key else 1 for i, n in enumerate(shape)]
+        w.append(value.reshape(new_shape))
+
+    # Return the product of the weights components, which will be
+    # broadcastable to d
+    return reduce(mul, w)
diff --git a/cf/data/utils.py b/cf/data/utils.py
index 61c0a5c92a..cda7f77c6e 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -574,85 +574,3 @@ def YMDhms(d, attr):
     d._map_blocks(partial(cf_YMDhms, attr=attr), dtype=int)
     d.override_units(Units(None), inplace=True)
     return d
-
-
-def format_weights(d, weights, axis=None):
-    """TODODASK."""
-    if not isinstance(weights, dict):
-        return weights
-
-    if not weights:
-        # No weights
-        return
-
-    ndim = d.ndim
-    if axis is None:
-        axis = tuple(range(d.ndim))
-    else:
-        axis = d._parse_axes(axis)
-
-    weights = weights.copy()
-    weights_axes = set()
-    for key, value in tuple(weights.items()):
-        del weights[key]
-        key = tuple(d._parse_axes(key))
-        if weights_axes.intersection(key):
-            raise ValueError("Duplicate weights axis")
-
-        if value.ndim > ndim: TODO: test weights .shpae against impled eaxes shape
-            raise ValueError(
-                f"Weights component for axes {key} with shape "
-                f"{weights.shape} is not broadcastable to data with "
-                f"shape {d.shape}"
-            )
-
-        weights[key] = value
-        weights_axes.update(key)
-
-    if not weights_axes.intersection(axis):
-        # No weights span collapse axes
-        return
-
-    # For each componente, add missing dimensions as size 1.
-    w = []
-    shape = d.shape
-    for key, value in weights.items():
-        new_shape = [n if i in key else 1 for i, n in enumerate(shape)]
-        w.append(value.reshape(new_shape))
-
-    # Return the product of the weights components, which will be
-    # broadcastable to d
-    return reduce(mul, w)
-
-
-def collapse(
-    func,
-    d,
-    axis=None,
-    weights=None,
-    keepdims=True,
-    mtol=1,
-    split_every=None,
-    ddof=None,
-):
-    """TODODASK."""
-    kwargs = {
-        "axis": axis,
-        "keepdims": keepdims,
-        "split_every": split_every,
-        "mtol": mtol,
-    }
-
-    if weights is not None:
-        weights = format_weights(d, weights, axis)
-        if weights is not None:
-            kwargs["weights"] = weights
-
-    if ddof is not None:
-        kwargs["ddof"] = ddof
-
-    dx = d.to_dask_array()
-    dx = func(dx, **kwargs)
-    d._set_dask(dx, reset_mask_hardness=True)
-
-    return d, weights
diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py
index f3070edd49..5cb11f12b3 100644
--- a/cf/docstring/docstring.py
+++ b/cf/docstring/docstring.py
@@ -247,24 +247,33 @@
         the computed non-parametric coodinates then this an empty
         tuple.""",
     # collapse axes
-    "{{collapse axes: (sequence of) int, optional}}": """axes: (sequence of) int, optional
-                The axes to be collapsed. By default flattened input
-                is used. Each axis is identified by its integer
-                position. No axes are collapsed if *axes* is an empty
-                sequence. TODODASK - is the axes=() behaviour
-                correct??""",
-    # collapse axes
+    "{{collapse axes: (sequence of) `int`, optional}}": """axes: (sequence of) `int`, optional
+                The axes to be collapsed. By default all axes are
+                collapsed, resulting in output with size 1. Each axis
+                is identified by its integer position. If *axes* is an
+                empty sequence then the collapse is applied to each
+                scalar element and the result has the same shape as
+                the input data.""",
+    # collapse squeeze
     "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional
                 By default, the axes which are collapsed are left in
                 the result as dimensions with size one, so that the
                 result will broadcast correctly against the input
                 array. If set to True then collapsed axes are removed
                 from the data.""",
-    # collapse weights
-    "{{collapse weights: optional}}}": """weights: optional
-                Weights associated with values of the array. By
-                default all non-missing elements of the array are
-                assumed to have a weight equal to one.
+    # collapse keepdims
+    "{{collapse keepdims: `bool`, optional}}": """keepdims: `bool`, optional
+                By default, the axes which are collapsed are left in
+                the result as dimensions with size one, so that the
+                result will broadcast correctly against the input
+                array. If set to False then collapsed axes are removed
+                from the data.""",
+    # weights
+    "{{weights: data_like, `dict`, or `None`, optional}}": """weights: data_like, `dict`, or `None`, optional
+                Weights associated with values of the array. By
+                default *weights* is `None`, meaning that all
+                non-missing elements of the array are assumed to have
+                a weight equal to one.
 
                 If *weights* is a data_like object then it must be
                 broadcastable to the array or, if that is not the
                 case, the same shape as the axes being collapsed.
 
                 If *weights* is a dictionary then each key specifies
                 axes of the array (an `int` or `tuple` of `int`), with
                 a corresponding value of data_like weights for those
-                axes. In this case, the implied weights array is the
-                outer product of the dictionary's values.
-
-
-                Specify the weights for the collapse axes. The weights
-                are, in general, those that would be returned by this
-                call of the field construct's `weights` method:
-                ``f.weights(weights, axes=axes, measure=measure,
-                scale=scale, radius=radius, great_circle=great_circle,
-                components=True)``. See the *axes*, *measure*,
-                *scale*, *radius* and *great_circle* parameters and
-                `cf.Field.weights` for details.
-
-                .. note:: By default *weights* is `None`, resulting in
-                          **unweighted calculations**.
-
-                If the alternative form of providing the collapse method
-                and axes combined as a CF cell methods-like string via the
-                *method* parameter has been used, then the *axes*
-                parameter is ignored and the axes are derived from the
-                *method* parameter. For example, if *method* is ``'T:
-                area: minimum'`` then this defines axes of ``['T',
-                'area']``. If *method* specifies multiple collapses,
-                e.g. ``'T: minimum area: mean'`` then this implies axes of
-                ``'T'`` for the first collapse, and axes of ``'area'`` for
-                the second collapse.
-
-                .. note:: Setting *weights* to `True` is generally a good
-                          way to ensure that all collapses are
-                          appropriately weighted according to the field
-                          construct's metadata. In this case, if it is not
-                          possible to create weights for any axis then an
-                          exception will be raised.
-
-                          However, care needs to be taken if *weights* is
-                          `True` when cell volume weights are desired. The
-                          volume weights will be taken from a "volume"
-                          cell measure construct if one exists, otherwise
-                          the cell volumes will be calculated as being
-                          proportional to the sizes of one-dimensional
-                          vertical coordinate cells. In the latter case
-                          **if the vertical dimension coordinates do not
-                          define the actual height or depth thickness of
-                          every cell in the domain then the weights will
-                          be incorrect**.
-
-                *Parameter example:*
-                  To specify weights based on the field construct's
-                  metadata for all collapse axes use ``weights=True``.
-
-                *Parameter example:*
-                  To specify weights based on cell areas use
-                  ``weights='area'``.
+                axes. The dimensions of a weights value must
+                correspond to its key axes in the same order. The
+                weights that will be used in the collapse will be the
+                outer product of the dictionary's values.""",
+    # collapse mtol
+    "{{mtol: number, optional}}": """mtol: number, optional
+                The sample size threshold below which collapsed values
+                are set to missing data. It is defined as a fraction
+                (between 0 and 1 inclusive) of the contributing input
+                data values. A missing datum in the output array
+                occurs whenever more than ``100*mtol%`` of its
+                contributing input array elements are missing
+                data. The default of *mtol* is 1, meaning that a
+                missing datum in the output array only occurs when all
+                of its contributing input array elements are missing
+                data. A value of 0 means that a missing datum in the
+                output array occurs whenever any of its contributing
+                input array elements are missing. Any intermediate
+                value is allowed. Note that for non-zero values of
+                *mtol*, different collapsed elements may have
+                different sample sizes, depending on the distribution
+                of missing data in the input data.""",
+    # ddof
+    "{{ddof: number}}": """ddof: number
+                The delta degrees of freedom. The number of degrees of
+                freedom used in the calculation is (N-*ddof*) where N
+                represents the number of non-missing elements. A value
+                of 1 applies Bessel's correction.""",
+    # split_every
+    "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional
+                Determines the depth of the recursive aggregation. 
If + set to or more than the number of input chunks, the + aggregation will be performed in two steps, one + partial collapse per input chunk and a single + aggregation at the end. If set to less than that, an + intermediate aggregation step will be used, so that + any of the intermediate or final aggregation steps + operates on no more than ``split_every`` inputs. The + depth of the aggregation graph will be + :math:`log_{split_every}(input chunks along reduced + axes)`. Setting to a low value can reduce cache size + and network transfers, at the cost of more CPU and a + larger dask graph. + + By default, `dask` heuristically decides on a good + value. A default can also be set globally with the + ``split_every`` key in `dask.config`. See + `dask.array.reduction` for details.""", + # Collapse weights + "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional + Weights associated with values of the array. By + default *weights* is `None`, meaning that all + non-missing elements of the array are assumed to have + a weight equal to one. - *Parameter example:* - To specify weights based on cell areas and linearly in - time you could set ``weights=('area', 'T')``.""", + When *weights* is a data_like object then it must have + the same shape as the array.""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index f2ac2a8498..35f88762de 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -60,15 +60,22 @@ def reshape_array(a, axes): b = b.reshape(new_shape) return b - -class DataTest(unittest.TestCase): - - axes_combinations = [ +def axis_combinations(a): + return [ axes for n in range(1, a.ndim + 1) for axes in itertools.combinations(range(a.ndim), n) ] +class DataTest(unittest.TestCase): + + axes_combinations = axis_combinations(a) + #[ + # axes + # for n in range(1, a.ndim + 1) + # for axes in itertools.combinations(range(a.ndim), n) + #] + filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_file.nc" ) @@ -1863,40 +1870,6 @@ def test_Data_flip(self): self.assertEqual(e[0].max().array, 3 * 4 * 5) self.assertEqual(e[-1].max().array, 4 * 5) - # def test_Data_max(self): - # if self.test_only and inspect.stack()[0][3] not in self.test_only: - # return - # - # d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre", chunks=2) - # self.assertEqual( - # d.max().array, cf.Data(6, "metre") - # ) - # self.assertEqual(d.max().array.datum(), 6) - # d[0, 2] = cf.masked - # self.assertEqual(d.max().array, 5) - # self.assertEqual(d.max().array.datum(), 5) - # self.assertEqual( - # d.maximum(_preserve_partitions=pp), cf.Data(0.005, "km") - # ) - # - # @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - # def test_Data_min(self): - # if self.test_only and inspect.stack()[0][3] not in self.test_only: - # return - # - # for pp in (False, True): - # d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - # self.assertEqual( - # d.minimum(_preserve_partitions=pp), cf.Data(1, "metre") - # ) - # self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 1) - # d[1, 0] = cf.masked - # self.assertEqual(d.minimum(_preserve_partitions=pp), 2) - # self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 2) - # self.assertEqual( - # d.minimum(_preserve_partitions=pp), cf.Data(0.002, "km") - # ) - def test_Data_ndindex(self): if self.test_only and 
inspect.stack()[0][3] not in self.test_only: return @@ -2551,130 +2524,6 @@ def test_Data__collapse_SHAPE(self): ) # --- End: for - def test_Data_max_min(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - msg = None - - # unmasked - d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) - for _np, h in zip( - (np.amin, np.amax), - ("min", "max"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - e = getattr(d, h)(axes=axes, squeeze=True) - if h == "sum_of_squares": - self.assertEqual(e.Units, cf.Units("m2")) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne={e.array}, \nb={b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # masked - d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) - for _np, h in zip( - (np.ma.amin, np.ma.amax), - ("min", "max"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - b = np.ma.asanyarray(b) - e = getattr(d, h)(axes=axes, squeeze=True) - if h == "sum_of_squares": - self.assertEqual(e.Units, cf.Units("m2")) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne.mask={e.mask.array}, \nb={b}") - - self.assertTrue((e.mask.array == b.mask).all(), msg) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne={e.array}, \nb={b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - def test_Data_sum_sum_of_squares(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - msg = None - - # unweighted, unmasked - d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) - for _np, h in zip( - (np.sum, np.sum), - ("sum", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - e = getattr(d, h)(axes=axes, squeeze=True) - if h == "sum_of_squares": - self.assertEqual(e.Units, cf.Units("m2")) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne={e.array}, \nb={b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # unweighted, masked - d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) - for _np, h in zip( - (np.ma.sum, np.ma.sum), - ("sum", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - b = np.ma.asanyarray(b) - e = getattr(d, h)(axes=axes, squeeze=True) - if h == "sum_of_squares": - self.assertEqual(e.Units, cf.Units("m2")) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne.mask={e.mask.array}, \nb={b}") - - self.assertTrue((e.mask.array == b.mask).all(), msg) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne={e.array}, \nb={b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # Need to do weighted - def test_Data_percentile_median(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -3071,101 +2920,101 @@ def test_Data_sum_of_weights_sum_of_weights2(self): ), ) - def test_Data_sum_mean_mean_absolute_value(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - msg = None - - for absolute in (False, True): - a = self.a - ma = self.ma - method 
= "mean" - if absolute: - a = np.absolute(a) - ma = np.absolute(ma) - method = "mean_absolute_value" - - # unweighted, unmasked - d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = np.mean(b, axis=-1) - e = getattr(d, method)(axes=axes, squeeze=True) - - # For debugging - # msg = (f"{method} unweighted, unmasked, axis={axes}, " - # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(a, axes) - v = reshape_array(self.w, axes) - b = np.average(b, axis=-1, weights=v) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - # For debugging - # msg = (f"{method} weighted, unmasked, axis={axes}, " - # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # unweighted, masked - d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) - for axes in self.axes_combinations: - b = reshape_array(ma, axes) - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, squeeze=True) - - # For debugging - # msg = (f"{method} unweighted, masked, axis={axes}, " - # f"\ne.mask={e.mask.array}, \nb={b}") - - self.assertTrue((e.mask.array == b.mask).all(), msg) - - # For debugging - # msg = (f"{method} unweighted, masked, axis={axes}, " - # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # weighted, masked - for axes in self.axes_combinations: - print(axes) - b = reshape_array(ma, axes) - v = reshape_array(self.mw, axes) - b = np.ma.average(b, axis=-1, weights=v) - b = np.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - # For debugging - # msg = (f"{method} weighted, masked, axis={axes}, " - # f"\ne.mask={e.mask.array}, \nb={b}") - - self.assertTrue((e.mask.array == b.mask).all(), msg) - - # For debugging - msg = ( - f"{method} weighted, masked, axis={axes}, " - f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}" - ) - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) +# def test_Data_sum_mean_mean_absolute_value(self): +# if self.test_only and inspect.stack()[0][3] not in self.test_only: +# return +# +# msg = None +# +# for absolute in (False, True): +# a = self.a +# ma = self.ma +# method = "mean" +# if absolute: +# a = np.absolute(a) +# ma = np.absolute(ma) +# method = "mean_absolute_value" +# +# # unweighted, unmasked +# d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) +# for axes in self.axes_combinations: +# b = reshape_array(a, axes) +# b = np.mean(b, axis=-1) +# e = getattr(d, method)(axes=axes, squeeze=True) +# +# # For debugging +# # msg = (f"{method} unweighted, unmasked, axis={axes}, " +# # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") +# +# self.assertTrue( +# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg +# ) +# +# # weighted, unmasked +# x = cf.Data(self.w) +# for axes in self.axes_combinations: +# b = reshape_array(a, axes) +# v = reshape_array(self.w, axes) +# b = np.average(b, axis=-1, weights=v) +# +# e = getattr(d, method)(axes=axes, weights=x, squeeze=True) +# +# # For debugging +# # msg = (f"{method} weighted, unmasked, axis={axes}, " +# # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") +# +# self.assertTrue( +# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg +# ) +# +# # unweighted, masked 
+# d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) +# for axes in self.axes_combinations: +# b = reshape_array(ma, axes) +# b = np.ma.average(b, axis=-1) +# b = np.ma.asanyarray(b) +# +# e = getattr(d, method)(axes=axes, squeeze=True) +# +# # For debugging +# # msg = (f"{method} unweighted, masked, axis={axes}, " +# # f"\ne.mask={e.mask.array}, \nb={b}") +# +# self.assertTrue((e.mask.array == b.mask).all(), msg) +# +# # For debugging +# # msg = (f"{method} unweighted, masked, axis={axes}, " +# # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") +# +# self.assertTrue( +# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg +# ) +# +# # weighted, masked +# for axes in self.axes_combinations: +# print(axes) +# b = reshape_array(ma, axes) +# v = reshape_array(self.mw, axes) +# b = np.ma.average(b, axis=-1, weights=v) +# b = np.ma.asanyarray(b) +# +# e = getattr(d, method)(axes=axes, weights=x, squeeze=True) +# +# # For debugging +# # msg = (f"{method} weighted, masked, axis={axes}, " +# # f"\ne.mask={e.mask.array}, \nb={b}") +# +# self.assertTrue((e.mask.array == b.mask).all(), msg) +# +# # For debugging +# msg = ( +# f"{method} weighted, masked, axis={axes}, " +# f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}" +# ) +# +# self.assertTrue( +# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg +# ) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_root_mean_square(self): @@ -3923,8 +3772,333 @@ def test_Data_change_calendar(self): # calendar). with self.assertRaises(ValueError): e = d.change_calendar("noleap").array + + def test_Data_max(self): + # Masked array + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(b, axis=-1) + b = np.ma.asanyarray(b) + e = d.max(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_maximum_absolute_value(self): + # Masked array + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(abs(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.maximum_absolute_value(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mean(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(b, axis=-1, weights=w) + b = np.ma.asanyarray(b) + e = d.mean(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mean_absolute_value(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(abs(b), axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean_absolute_value(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mid_range(self): + # Masked array, non-masked weights + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = (np.max(b, 
axis=-1) + np.min(b, axis=-1)) / 2.0 + b = np.ma.asanyarray(b) + + e = d.mid_range(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_min(self): + # Masked array + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.min(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.min(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_minimum_absolute_value(self): + # Masked array + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.min(abs(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.minimum_absolute_value(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_range(self): + # Masked array + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(b, axis=-1) - np.min(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.range(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_root_mean_square(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(b * b, axis=-1, weights=w) ** 0.5 + b = np.ma.asanyarray(b) + + e = d.root_mean_square(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sample_size(self): + # Masked array + a = self.ma + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.sample_size(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Non-masked array + a = self.a + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.asanyarray(b) + + e = d.sample_size(axes=axis, squeeze=True) + e = np.array(e.array) + + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_squares(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_squares(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + 
            self.assertTrue(np.allclose(e, b))
+
+    def test_Data_sum_of_weights(self):
+        # Masked array, non-masked weights
+        a = self.ma
+        weights = self.w
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
+
+        for axis in axis_combinations(a):
+            b = reshape_array(a, axis)
+            w = reshape_array(weights, axis)
+            w = np.ma.masked_where(b.mask, w)
+            b = np.sum(w, axis=-1)
+            b = np.ma.asanyarray(b)
+
+            e = d.sum_of_weights(axes=axis, weights=weights, squeeze=True)
+            e = np.ma.array(e.array)
+
+            self.assertTrue((e.mask == b.mask).all())
+            self.assertTrue(np.allclose(e, b))
+
+    def test_Data_sum_of_weights2(self):
+        # Masked array, non-masked weights
+        a = self.ma
+        weights = self.w
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
+
+        for axis in axis_combinations(a):
+            b = reshape_array(a, axis)
+            w = reshape_array(weights, axis)
+            w = np.ma.masked_where(b.mask, w)
+            b = np.sum(w * w, axis=-1)
+            b = np.ma.asanyarray(b)
+
+            e = d.sum_of_weights2(axes=axis, weights=weights, squeeze=True)
+            e = np.ma.array(e.array)
+
+            self.assertTrue((e.mask == b.mask).all())
+            self.assertTrue(np.allclose(e, b))
+
+        e = d.sum_of_weights2(weights=weights)
+        self.assertEqual(e.Units, cf.Units())
+
+        e = d.sum_of_weights2(weights=cf.Data(weights, "km"))
+        self.assertEqual(e.Units, cf.Units("km2"))
+
+    def test_Data_var(self):
+        # Masked array, non-masked weights
+        a = self.ma
+        weights = self.w
+        d = cf.Data(a, "m", chunks=(2, 3, 2, 5))
+
+        # ddof = 0
+        for axis in axis_combinations(a):
+            b = reshape_array(a, axis)
+            w = reshape_array(weights, axis)
+            mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True)
+            mu = mu.reshape(mu.shape + (1,))
+            w = np.ma.masked_where(b.mask, w)
+
+            b = np.sum(w * (b - mu) ** 2, axis=-1)
+            b = b / V1
+            b = np.ma.asanyarray(b)
+
+            e = d.var(axes=axis, weights=weights, ddof=0, squeeze=True)
+            e = np.ma.array(e.array)
+
+            self.assertTrue((e.mask == b.mask).all())
+            self.assertTrue(np.allclose(e, b), e - b)
+
+        # ddof = 1
+        for axis in axis_combinations(a):
+            b = reshape_array(a, axis)
+            w = reshape_array(weights, axis)
+            mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True)
+            mu = mu.reshape(mu.shape + (1,))
+            w = np.ma.masked_where(b.mask, w)
+            V2 = np.sum(w * w, axis=-1)
+
+            b = np.sum(w * (b - mu) ** 2, axis=-1)
+            b = b / (V1 - (V2 / V1))
+            b = np.ma.asanyarray(b)
+
+            e = d.var(axes=axis, weights=weights, ddof=1, squeeze=True)
+            e = np.ma.array(e.array)
+
+            self.assertTrue((e.mask == b.mask).all())
+            self.assertTrue(np.allclose(e, b))
+
+        # Unweighted ddof = 1. Note that, unlike the weighted cases
+        # above, no weights enter the reference calculation here.
+        for axis in axis_combinations(a):
+            b = reshape_array(a, axis)
+            mu, V1 = np.ma.average(b, axis=-1, returned=True)
+            mu = mu.reshape(mu.shape + (1,))
+
+            b = np.sum((b - mu) ** 2, axis=-1)
+            b = b / (V1 - 1)
+            b = np.ma.asanyarray(b)
+
+            e = d.var(axes=axis, ddof=1, squeeze=True)
+            e = np.ma.array(e.array)
+
+            self.assertTrue((e.mask == b.mask).all())
+            self.assertTrue(np.allclose(e, b))
+
+        e = d.var(ddof=0)
+        self.assertEqual(e.Units, cf.Units("m2"))
+
 if __name__ == "__main__":
     print("Run date:", datetime.datetime.now())
     cf.environment()

From 9ff92ca7e9d1b7ba465638cb1cd8e4f706fc7eb9 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Wed, 16 Mar 2022 17:42:24 +0000
Subject: [PATCH 09/37] dev

---
 cf/__init__.py                |   19 +-
 cf/constants.py               |   19 +-
 cf/data/collapse_functions.py |  282 +++
 cf/data/data.py               | 1176 ++++++++++++++++++++++-----------
 cf/docstring/docstring.py     |   37 +-
 cf/functions.py               |   13 +-
 cf/test/test_Data.py          | 1056 +++++++++--------------------
 requirements.txt              |    1 +
 8 files changed, 1347
insertions(+), 1256 deletions(-) diff --git a/cf/__init__.py b/cf/__init__.py index 4d421c3ab3..b1aceef9a6 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -78,7 +78,7 @@ __date__ = "2022-01-18" __version__ = "4.0.0b0" -_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil") +_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil", "dask") x = ", ".join(_requires) _error0 = f"cf v{ __version__} requires the modules {x}. " @@ -104,10 +104,6 @@ _found_ESMF = bool(importlib.util.find_spec("ESMF")) -# TODODASK - Remove the next 2 lines when the move to dask is complete -mpi_on = False -mpi_size = 1 - try: import netCDF4 except ImportError as error1: @@ -133,6 +129,11 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) +try: + import dask +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + # Check the version of psutil _minimum_vn = "0.6.0" if LooseVersion(psutil.__version__) < LooseVersion(_minimum_vn): @@ -183,6 +184,14 @@ f"Got {_cfdm_version} at {cfdm.__file__}" ) +# Check the version of dask +_minimum_vn = "2020.2.1" +if LooseVersion(dask.__version__) < LooseVersion(_minimum_vn): + raise RuntimeError( + f"Bad dask version: cf requires dask>={_minimum_vn}. " + f"Got {dask.__version__} at {dask.__file__}" + ) + from .constructs import Constructs from .mixin import Coordinate diff --git a/cf/constants.py b/cf/constants.py index 07fc689ee1..2af292d6d4 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -6,11 +6,6 @@ from numpy.ma import masked as numpy_ma_masked from psutil import virtual_memory -from . import mpi_on, mpi_size - -if mpi_on: - from . import mpi_comm - from .units import Units # platform = sys.platform @@ -78,9 +73,8 @@ disabled. FREE_MEMORY_FACTOR: `int` - Factor to divide the free memory by. If MPI is on this is equal - to the number of PEs. Otherwise it is equal to 1 and is ignored - in any case. + Factor to divide the free memory by. It is equal to 1 and is + ignored in any case. COLLAPSE_PARALLEL_MODE: `int` The mode to use when parallelising collapse. By default this is @@ -113,17 +107,12 @@ CONSTANTS["FREE_MEMORY_FACTOR"] * CONSTANTS["TOTAL_MEMORY"] ) -if mpi_on: - CONSTANTS["MIN_TOTAL_MEMORY"] = min( - mpi_comm.allgather(CONSTANTS["TOTAL_MEMORY"]) - ) -else: - CONSTANTS["MIN_TOTAL_MEMORY"] = CONSTANTS["TOTAL_MEMORY"] +CONSTANTS["MIN_TOTAL_MEMORY"] = CONSTANTS["TOTAL_MEMORY"] CONSTANTS["CHUNKSIZE"] = ( CONSTANTS["FREE_MEMORY_FACTOR"] * CONSTANTS["MIN_TOTAL_MEMORY"] ) / ( - mpi_size * CONSTANTS["WORKSPACE_FACTOR_1"] + CONSTANTS["WORKSPACE_FACTOR_1"] + CONSTANTS["WORKSPACE_FACTOR_2"] ) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index 3985a9c528..835ceb9864 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -24,6 +24,10 @@ def max(a, axis=None, keepdims=False, mtol=None, split_every=None): Calculates the maximum value of an array or the maximum values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -66,6 +70,10 @@ def max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): Calculates the maximum absolute value of an array or the maximum absolute values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. 
versionadded:: TODODASK :Parameters: @@ -110,6 +118,10 @@ def mean( Calculates the mean value of an array or the mean values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -152,11 +164,15 @@ def mean( def mean_abs( a, weights=None, axis=None, keepdims=False, mtol=None, split_every=None ): - """"Return mean absolute values of an array. + """Return mean absolute values of an array. Calculates the mean absolute value of an array or the mean absolute values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -204,6 +220,10 @@ def mid_range( Calculates the mid-range value of an array or the mid-range values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -246,6 +266,10 @@ def min(a, axis=None, keepdims=False, mtol=None, split_every=None): Calculates the minimum value of an array or the minimum values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -288,8 +312,11 @@ def min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): Calculates the minimum absolute value of an array or the minimum absolute values along axes. - .. versionadded:: TODODASK + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -331,6 +358,10 @@ def range(a, axis=None, keepdims=False, mtol=None, split_every=None): Calculates the range value of an array or the range values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -375,6 +406,10 @@ def rms( Calculates the RMS value of an array or the RMS values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -420,6 +455,10 @@ def sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): Calculates the sample size value of an array or the sample size values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -464,8 +503,11 @@ def sum( Calculates the sum value of an array or the sum values along axes. - .. versionadded:: TODODASK + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -507,56 +549,60 @@ def sum( weights=weights, ) - @staticmethod - def sum_of_squares( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None - ): - """Return sum of square values of an array. - - Calculates the sum of square value of an array or the sum of - square values along axes. - - .. versionadded:: TODODASK - - :Parameters: - - a: `dask.array.Array` - The array to be collapsed. - - {{Collapse weights: data_like or `None`, optional}} - - {{collapse axes: (sequence of) `int`, optional}} - - {{collapse keepdims: `bool`, optional}} - - {{mtol: number, optional} - - {{split_every: `int` or `dict`, optional}} - - :Returns: - - `dask.array.Array` - The collapsed array. 
- - """ - if weights is None: - dtype = double_precision_dtype(a) - else: - dtype = "f8" - - return reduction( - a, - partial(cf_sum_chunk, squared=True), - partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), - axis=axis, - keepdims=keepdims, - dtype=dtype, - split_every=split_every, - combine=cf_sum_combine, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, - ) + # @staticmethod + # def sum_of_squares( + # a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + # ): + # """Return sum of square values of an array. + # + # Calculates the sum of square value of an array or the sum of + # square values along axes. + # + # See + # https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + # for mathematical definitions. + # + # .. versionadded:: TODODASK + # + # :Parameters: + # + # a: `dask.array.Array` + # The array to be collapsed. + # + # {{Collapse weights: data_like or `None`, optional}} + # + # {{collapse axes: (sequence of) `int`, optional}} + # + # {{collapse keepdims: `bool`, optional}} + # + # {{mtol: number, optional} + # + # {{split_every: `int` or `dict`, optional}} + # + # :Returns: + # + # `dask.array.Array` + # The collapsed array. + # + # """ + # if weights is None: + # dtype = double_precision_dtype(a) + # else: + # dtype = "f8" + # + # return reduction( + # a, + # partial(cf_sum_chunk, squared=True), + # partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + # axis=axis, + # keepdims=keepdims, + # dtype=dtype, + # split_every=split_every, + # combine=cf_sum_combine, + # concatenate=False, + # meta=np.array((), dtype=dtype), + # weights=weights, + # ) @staticmethod def sum_of_weights( @@ -567,6 +613,10 @@ def sum_of_weights( Calculates the sum of weights value for an array or the sum of weights values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -614,6 +664,10 @@ def sum_of_weights2( Calculates the sum of squares of weights value for an array or the sum of squares of weights values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -667,6 +721,10 @@ def var( Calculates the variance value of an array or the variance values along axes. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + .. versionadded:: TODODASK :Parameters: @@ -696,7 +754,7 @@ def var( return reduction( a, partial(cf_var_chunk, ddof=ddof), - partial(cf_var_agg, mtol=mtol, ddof=ddof, original_shape=a.shape), + partial(cf_var_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, dtype=dtype, @@ -723,7 +781,7 @@ def double_precision_dtype(a): **Examples** >>> for d in (float, 'float32', int, 'int32'): - ... print(double_precision_dtype(np.array(9, dtype=d))) + ... print(double_precision_dtype(np.array(1, dtype=d))) ... f8 f8 @@ -792,7 +850,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): return x -def sum_weights(x, weights=None, squared=False, dtype="f8", N=None, **kwargs): +def sum_weights(x, weights=None, squared=False, N=None, dtype="f8", **kwargs): """TODO. .. versionadded:: TODODASK @@ -808,9 +866,9 @@ def sum_weights(x, weights=None, squared=False, dtype="f8", N=None, **kwargs): # the squares of the weights are both equal to the sample # size. 
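    # (An editorial sketch, not part of the patch: in the unweighted
    # case the value returned below is just the count of non-missing
    # elements, e.g.
    #
    #     import numpy as np
    #     x = np.ma.masked_array([1.0, 2.0, 3.0], mask=[0, 1, 0])
    #     assert int(np.ma.count(x)) == 2
    # )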
if N is None: - return cf_sample_size_chunk(x, dtype=dtype, **kwargs)["N"] + return cf_sample_size_chunk(x, **kwargs)["N"] - return N.astype(dtype) + return N if squared: weights = np.multiply(weights, weights, dtype=dtype) @@ -831,6 +889,8 @@ def combine_arrays( along the axes, and apply a function to the result along the same axes. + .. versionadded:: TODODASK + :Returns: `numpy.ndarray` @@ -846,7 +906,11 @@ def combine_arrays( def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): - """Alias of `combine_arrays` with ``func=chunk.sum``.""" + """Alias of `combine_arrays` with ``func=chunk.sum``. + + .. versionadded:: TODODASK + + """ return combine_arrays( pairs, key, chunk.sum, axis, dtype, computing_meta, **kwargs ) @@ -864,7 +928,11 @@ def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): - """Alias of `combine_arrays` with ``func=chunk.min``.""" + """Alias of `combine_arrays` with ``func=chunk.min``. + + .. versionadded:: TODODASK + + """ return combine_arrays( pairs, key, chunk.min, axis, dtype, computing_meta, **kwargs ) @@ -915,6 +983,8 @@ def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): d["V1"] = sum_weights(x, weights, N=d["N"], **kwargs) + d["weighted"] = weights is not None + return d @@ -941,13 +1011,21 @@ def cf_mean_combine( if not isinstance(pairs, list): pairs = [pairs] - d = {} - for key in ("sum", "V1"): - d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d[key] + d = {"weighted": next(flatten(pairs))["weighted"]} + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d["sum"] d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + + if d["weighted"]: + d["V1"] = sum_arrays( + pairs, "V1", axis, dtype, computing_meta, **kwargs + ) + else: + d["V1"] = d["N"] + return d @@ -960,7 +1038,7 @@ def cf_mean_agg( original_shape=None, **kwargs, ): - """"Aggregate calculations for the mean. + """Aggregate calculations for the mean. This function is passed to `dask.array.reduction` as callable *aggregate* parameter. @@ -1525,7 +1603,7 @@ def cf_rms_agg( `dask.array.Array` The collapsed array. - """ + """ d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) if computing_meta: return d @@ -1799,9 +1877,10 @@ def cf_sum_of_weights_chunk( # N d = cf_sample_size_chunk(x, **kwargs) - # sum - d["sum"] = sum_weights(x, weights=weights, squared=squared, - N=d["N"], **kwargs ) + d["sum"] = sum_weights( + x, weights=weights, squared=squared, N=d["N"], **kwargs + ) + return d @@ -1840,12 +1919,13 @@ def cf_var_chunk( * N: The sample size. * V1: The sum of ``weights`` (equal to ``N`` if weights are not set). + * V2: The sum of ``weights**2``. * sum: The weighted sum of ``x``. * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is the weighted biased (i.e. ``ddof=0``) variance of ``x``, and ``mu`` is the weighted mean of ``x``. - * V2: The sum of ``weights**2``. Only present if *weights* - are set and ``ddof=1``. + * weighted: True if weights have been set. + * ddof: The delta degrees of freedom. 
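+
+            An editorial note, not in the original docstring: because
+            ``part = sum(w*(x - avg)**2) + avg*sum(w*x)`` expands to
+            ``sum(w*x**2)``, it equals ``V1*(sigma**2 + mu**2)``, so
+            partial results can be combined across chunks by simple
+            addition. A quick numpy check of the identity:
+
+                >>> import numpy as np
+                >>> x = np.array([1.0, 2.0, 3.0])
+                >>> w = np.array([1.0, 1.0, 2.0])
+                >>> mu = np.sum(w * x) / np.sum(w)
+                >>> part = np.sum(w * (x - mu) ** 2) + mu * np.sum(w * x)
+                >>> bool(np.isclose(part, np.sum(w * x * x)))
+                True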
""" if computing_meta: @@ -1857,7 +1937,6 @@ def cf_var_chunk( wsum = d["sum"] V1 = d["V1"] - # part avg = divide(wsum, V1, dtype=dtype) part = x - avg part *= part @@ -1866,10 +1945,16 @@ def cf_var_chunk( part = chunk.sum(part, dtype=dtype, **kwargs) part = part + avg * wsum + d["part"] = part - if weights is not None and ddof == 1: - d["V2"] = sum_weights(x, weights, squared=True, **kwargs) + if ddof == 1: + d["V2"] = sum_weights(x, weights, squared=True, N=d["N"], **kwargs) + else: + d["V2"] = d["N"] + + d["weighted"] = weights is not None + d["ddof"] = ddof return d @@ -1897,22 +1982,32 @@ def cf_var_combine( if not isinstance(pairs, list): pairs = [pairs] - weighted = "V2" in flatten(pairs) + d = next(flatten(pairs)) + weighted = d["weighted"] + ddof = d["ddof"] - keys = ("part", "sum") - if weighted: - keys += ("V1", "V2") + d = {"weighted": weighted, "ddof": ddof} - d = {} - for key in keys: - d[key] = sum_arrays(pairs, key, axis, dtype, computing_meta, **kwargs) - if computing_meta: - return d[key] + d["part"] = sum_arrays( + pairs, "part", axis, dtype, computing_meta, **kwargs + ) + if computing_meta: + return d["part"] d["N"] = sum_sample_sizes(pairs, axis, **kwargs) - if not weighted: - d["V1"] = d["N"].astype("f8") + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + + d["V1"] = d["N"] + d["V2"] = d["N"] + if weighted: + d["V1"] = sum_arrays( + pairs, "V1", axis, dtype, computing_meta, **kwargs + ) + if ddof == 1: + d["V2"] = sum_arrays( + pairs, "V2", axis, dtype, computing_meta, **kwargs + ) return d @@ -1923,7 +2018,6 @@ def cf_var_agg( dtype="f8", computing_meta=False, mtol=None, - ddof=None, original_shape=None, **kwargs, ): @@ -1948,12 +2042,6 @@ def cf_var_agg( set to missing data. See `mask_small_sample_size` for details. - ddof: number - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of non-missing elements. A value of - 1 applies Bessel's correction. - original_shape: `tuple` The shape of the original, uncollapsed data. @@ -1969,6 +2057,7 @@ def cf_var_agg( if computing_meta: return d + ddof = d["ddof"] V1 = d["V1"] wsum = d["sum"] var = d["part"] - wsum * wsum / V1 @@ -1976,21 +2065,18 @@ def cf_var_agg( # Note: var is now the global value of V1 * sigma**2, where sigma # is the global weighted biased (i.e. ddof=0) variance. - V2 = d.get("V2") - weighted = V2 is not None - if ddof is None: raise ValueError(f"Must set ddof to a numeric value. Got: {ddof!r}") if not ddof: # Weighted or unweighted variance with ddof=0 f = 1 / V1 - elif not weighted: + elif not d["weighted"]: # Unweighted variance with any non-zero value of ddof f = 1 / (V1 - ddof) elif ddof == 1: # Weighted variance with ddof=1 - f = V1 / (V1 * V1 - V2) + f = V1 / (V1 * V1 - d["V2"]) else: raise ValueError( "Can only calculate a weighted variance with ddof=0 or ddof=1. " diff --git a/cf/data/data.py b/cf/data/data.py index c3b1bcf5eb..eeacd31b89 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1949,17 +1949,45 @@ def median( squeeze=False, mtol=1, inplace=False, - _preserve_partitions=False, ): - """Compute the median of the values. + """Calculate median values. + + Calculates the median value or the median values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. 
+ + ..seealso:: `mean_of_upper_decile`, `percentile` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} - {{split_every: `int` or `dict`, optional}} + {{inplace: `bool`, optional}} - .. versionadded:: TODODASK + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2]) + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.median() + """ return self.percentile( @@ -1974,27 +2002,61 @@ def median( def mean_of_upper_decile( self, axes=None, - include_decile=True, - squeeze=False, weights=None, + squeeze=False, mtol=1, + include_decile=True, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Compute the mean the of upper decile. + """Calculate means of the upper deciles. + + Calculates the mean of the upper decile or the mean or the + mean of the upper decile values along axes. - Specifically, calculate the mean of the upper group of data - values defined by the upper tenth of their distribution. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean`, `median`, `percentile` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} + TODODASK - note that mtol onlty applies calculation of + uppder decile, not the mean! + + include_decile: `bool`, optional + TODODASK {{split_every: `int` or `dict`, optional}} .. versionadded:: TODODASK + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + TODODASK + """ + + # TODODASK: Some updates off the back of collapse done, but + # still needs looking at. Unit test has also been + # written, but not run. Needs __lt__ and __le__. + d = _inplace_enabled_define_and_cleanup(self) p90 = d.percentile( @@ -2003,7 +2065,6 @@ def mean_of_upper_decile( squeeze=False, mtol=mtol, inplace=False, - _preserve_partitions=_preserve_partitions, ) with numpy_testing_suppress_warnings() as sup: @@ -2014,7 +2075,6 @@ def mean_of_upper_decile( mask = d < p90 else: mask = d <= p90 - # --- End: with if mtol < 1: mask.filled(False, inplace=True) @@ -2023,10 +2083,11 @@ def mean_of_upper_decile( d.mean( axes=axes, - squeeze=squeeze, weights=weights, + squeeze=squeeze, + mtol=1, + split_every=split_every, inplace=True, - _preserve_partitions=_preserve_partitions, ) return d @@ -6938,55 +6999,21 @@ def max( inplace=False, i=False, ): - """TODO. - - :Parameters: - - {{mtol: number, optional} - - {{split_every: `int` or `dict`, optional}} - - .. versionadded:: TODODASK - - """ - # from .collapse_functions import cf_max - - d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( - Collapse.max, - d, - axis=axes, - keepdims=not squeeze, - split_every=split_every, - mtol=mtol, - ) - - return d - - @_deprecated_kwarg_check("i") - def maximum( - self, - axes=None, - squeeze=False, - mtol=1, - split_every=None, - inplace=False, - i=False, - _preserve_partitions=False, - ): - """Alias. + """Calculate maximum values. - Collapse axes with their maximum. + Calculates the maximum value or the maximum values along axes. - Missing data array elements are omitted from the calculation. 
+ See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `minimum`, `mean`, `mid_range`, `sum`, `sd`, `var` + ..seealso:: `maximum_absolute_value`, `min` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{collapse squeeze: `bool`, optional}} {{mtol: number, optional} @@ -6996,22 +7023,40 @@ def maximum( {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.max() + """ - return self.max( - axes=axes, - squeeze=squeeze, - mtol=mtol, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.max, + d, + axis=axes, + keepdims=not squeeze, split_every=split_every, - inplace=inplace, + mtol=mtol, ) + return d + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def maximum_absolute_value( @@ -7022,18 +7067,22 @@ def maximum_absolute_value( split_every=None, inplace=False, ): - """Collapse axes with their maximum absolute value. + """Calculate maximum absolute values. - Missing data elements are omitted from the calculation. + Calculates the maximum absolute value or the maximum absolute + values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `max`, `minimum_absolute_value` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{collapse squeeze: `bool`, optional}} {{mtol: number, optional} @@ -7043,27 +7092,28 @@ def maximum_absolute_value( {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.maximum_absolute_value() - - >>> d.max() - - >>> d.maximum_absolute_value(axes=1) - - >>> d.max(axes=1) - + """ - # from .collapse_functions import cf_max_abs - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.max_abs, @@ -7088,52 +7138,27 @@ def min( i=False, _preserve_partitions=False, ): - """TODO. - - :Parameters: + """Calculate minimum values. - {{mtol: number, optional} + Calculates the minimum value or the minimum values along axes. - {{split_every: `int` or `dict`, optional}} + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. versionadded:: TODODASK + ..seealso:: `max`, `minimum_absolute_value` - """ - # from .collapse_functions import cf_min - - d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( - Collapse.min, - d, - axis=axes, - keepdims=not squeeze, - split_every=split_every, - mtol=mtol, - ) - return d - - @_deprecated_kwarg_check("i") - def minimum( - self, - axes=None, - squeeze=False, - mtol=1, - split_every=None, - inplace=False, - i=False, - _preserve_partitions=False, - ): - """Alias. 
+ :Parameters: - Collapse axes with their minimum. + {{collapse axes: (sequence of) `int`, optional}} - Missing data array elements are omitted from the calculation. + {{collapse squeeze: `bool`, optional}} - :Parameters: + {{mtol: number, optional} - axes : (sequence of) int, optional + {{split_every: `int` or `dict`, optional}} - squeeze : bool, optional + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -7142,20 +7167,33 @@ def minimum( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - .. seealso:: `maximum`, `mean`, `mid_range`, `sum`, `sd`, `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.min() + """ - return self.min( - axes=axes, - squeeze=squeeze, - mtol=mtol, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.min, + d, + axis=axes, + keepdims=not squeeze, split_every=split_every, - inplace=inplace, + mtol=mtol, ) + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -7167,12 +7205,16 @@ def minimum_absolute_value( split_every=None, inplace=False, ): - """Collapse axes with their minimum absolute value. + """Calculate minimum absolute values. - Missing data elements are omitted from the calculation. + Calculates the minimum absolute value or the minimum absolute + values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `maximum_absolute_value`, `min` :Parameters: @@ -7188,27 +7230,29 @@ def minimum_absolute_value( {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.minimum_absolute_value() - - >>> d.min() - - >>> d.minimum_absolute_value(axes=1) - - >>> d.min(axes=1) - + """ - # from .collapse_functions import cf_min_abs - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.min_abs, @@ -7233,22 +7277,24 @@ def mean( split_every=None, i=False, ): - """Collapse axes with their mean. + """Calculate mean values. + + Calculates the mean value or the mean values along axes. - The mean is unweighted by default, but may be weighted (see the - *weights* parameter). + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - Missing data array elements and their corresponding weights - are omitted from the calculation. + ..seealso:: `mean_abslute_value`, `sd`, `sum` :Parameters: {{collapse axes: (sequence of) `int`, optional}} - {{collapse squeeze: `bool`, optional}} - {{weights: data_like, `dict`, or `None`, optional}} + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} {{split_every: `int` or `dict`, optional}} @@ -7257,6 +7303,33 @@ def mean( {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. 
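+
+        (Editorial note, not in the original docstring: the weighted
+        mean is ``sum(w * x) / sum(w)`` over the non-missing
+        elements, e.g., assuming only numpy,
+
+        >>> import numpy as np
+        >>> x = np.ma.masked_array([1.0, 2.0, 3.0], mask=[0, 1, 0])
+        >>> float(np.ma.average(x, weights=[1.0, 1.0, 3.0]))
+        2.5
+
+        since the masked element contributes neither to the numerator
+        nor to the denominator.)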
+ + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.mean() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.mean(weights=w) + + """ d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( @@ -7280,23 +7353,26 @@ def mean_absolute_value( weights=None, split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their mean absolute value. + """Calculate mean absolute values. - Missing data elements are omitted from the calculation. + Calculates the mean absolute value or the mean absolute values + along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean`, `sd`, `sum` :Parameters: {{collapse axes: (sequence of) `int`, optional}} - {{collapse squeeze: `bool`, optional}} - {{weights: data_like, `dict`, or `None`, optional}} + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} {{split_every: `int` or `dict`, optional}} @@ -7311,17 +7387,27 @@ def mean_absolute_value( The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.mean_absolute_value() - - >>> d.mean_absolute_value(axes=1) - + - """ - # from .collapse_functions import cf_mean_abs + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.mean_absolute_value(weights=w) + + """ d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.mean_abs, @@ -7346,37 +7432,23 @@ def integral( inplace=False, _preserve_partitions=False, ): - """Collapse axes with their integral. + """Calculate summed values. + + Calculates the sum value or the sum values along axes. - If weights are not provided then all non-missing elements are - given weighting of one such that the collapse method becomes - a `sum`. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean`, `sd`, `sum` :Parameters: {{collapse axes: (sequence of) `int`, optional}} - {{collapse squeeze: `bool`, optional}} - {{weights: data_like, `dict`, or `None`, optional}} - Note that the units of the weights matter for an - integral collapse, which differs from a weighted sum - in that the units of the weights are incorporated into - the result. - - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must - contain 1-dimensional weights for axis 1 and ``x`` - must contain 2-dimensional weights for axes 2 and - 0. This is equivalent, for example, to - ``weights={(1, 2, 0), y}``, where ``y`` is the outer - product of ``w`` and ``x``. If ``axes=[1, 2, 0]`` - then ``weights={(1, 2, 0), y}`` is equivalent to - ``weights=y``. If ``axes=None`` and the array is - 3-dimensional then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y.transpose([2, 0, 1])``. 
- + {{collapse squeeze: `bool`, optional}} {{mtol: number, optional} @@ -7386,20 +7458,37 @@ def integral( {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` - The collapsed data, or `None` of the operation was + The collapsed data, or `None` if the operation was in-place. - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.integral() + - """ - # from .collapse_functions import cf_sum + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.integral(weights=w) + + + >>> d.integral(weights=cf.Data(w, 'm')) + + """ d = _inplace_enabled_define_and_cleanup(self) d, weights = _collapse( Collapse.sum, @@ -7438,19 +7527,49 @@ def sample_size( inplace=False, i=False, ): - """TODO. + """Calculate sample size values. + + The sample size is the number of non-missing values. + + Calculates the sample size value or the sample size values + along axes. :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} {{split_every: `int` or `dict`, optional}} .. versionadded:: TODODASK + {{inplace: `bool`, optional}} + + {{i: deprecated at version 3.0.0}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sample_size() + + """ - # from .collapse_functions import cf_sample_size - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.sample_size, @@ -7460,6 +7579,8 @@ def sample_size( split_every=split_every, mtol=mtol, ) + d.override_units(_units_None, inplace=True) + return d @property @@ -7853,7 +7974,8 @@ def count(self): 8 """ - # TODODASK - daskify, previously parallelise=mpi_on (not =False) + # TODODASK - simply use da.ma.count (dask>=2022.3.1) + config = self.partition_configuration(readonly=True) n = 0 @@ -10022,8 +10144,6 @@ def mid_range( **Examples:** """ - # from .collapse_functions import cf_mid_range - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.mid_range, @@ -10360,8 +10480,6 @@ def root_mean_square( **Examples:** """ - # from .collapse_functions import cf_rms - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.rms, @@ -12001,8 +12119,6 @@ def range( **Examples:** """ - # from .collapse_functions import cf_range - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.range, @@ -12078,16 +12194,60 @@ def sum( split_every=None, i=False, ): - """TODO. + """Calculate sum values. + + Calculates the sum value or the sum values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `integral`, `mean`, `sd`, `sum_of_squares`, + `sum_of_weights` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} {{split_every: `int` or `dict`, optional}} .. 
versionadded:: TODODASK + {{inplace: `bool`, optional}} + + {{i: deprecated at version 3.0.0}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum(weights=cf.Data(w, 'm')) + + """ d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( @@ -12106,19 +12266,23 @@ def sum( def sum_of_squares( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, split_every=None, inplace=False, ): - """Collapse axes with the sum of the squares of the values. + """Calculate sums of squares. - Missing data array elements are omitted from the calculation. + Calculates the sum of squares or the sum of squares values + along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, - `sample_size`, `sd`, `sum_of_weights`, - `sum_of_weights2`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sum`, `sum_of_squares`, + `sum_of_weights2` :Parameters: @@ -12142,30 +12306,45 @@ def sum_of_squares( The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.sum_of_squares() - - >>> d.sum_of_squares(axes=1) - + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_squares(weights=w) + """ d = _inplace_enabled_define_and_cleanup(self) - d, _ = _collapse( - Collapse.sum_of_squares, - d, - axis=axes, + # d, _ = _collapse( + # Collapse.sum_of_squares, + # d, + # axis=axes, + # weights=weights, + # keepdims=not squeeze, + # split_every=split_every, + # mtol=mtol, + # ) + d.square(inplace=True) + d.sum( + axes=axes, weights=weights, - keepdims=not squeeze, - split_every=split_every, + squeeze=squeeze, mtol=mtol, + split_every=split_every, + inplace=True, ) - - units = d.Units - if units: - d.override_units(units ** 2, inplace=True) - return d @daskified(_DASKIFIED_VERBOSE) @@ -12181,12 +12360,23 @@ def sum_of_weights( inplace=False, i=False, ): - """Collapse axes with the sum of weights. + """Calculate sums of weights. - Missing data array elements are omitted from the calculation. + Calculates the sum of weights or the sum of weights values + along axes. + + The weights given by the *weights* parameter are internally + broadcast to the shape of the data, and those weights that are + missing data, or that correspond to the missing elements of + the data, are assigned a weight of 0. It is these processed + weights that are summed. - .. seealso:: `maximum`, `mean`, `mid_range`, `minimum`, `range`, - `sample_size`, `sd`, `sum`, `sum_of_weights2`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sum`, `sum_of_squares`, + `sum_of_weights2` :Parameters: @@ -12209,13 +12399,32 @@ def sum_of_weights( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. 
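+
+        (Editorial sketch, not in the original docstring: with data
+        mask ``m`` and weights ``w``, the collapsed value is the sum
+        of ``np.where(m, 0, w)``, e.g.
+
+        >>> import numpy as np
+        >>> m = np.array([False, True, False])
+        >>> w = np.array([2.0, 3.0, 4.0])
+        >>> float(np.where(m, 0, w).sum())
+        6.0
+
+        since weights at missing elements are zeroed before summing.)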
**Examples** - """ - # from .collapse_functions import cf_sum_of_weights + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum_of_weights() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_weights(weights=w) + + >>> d.sum_of_weights(weights=cf.Data(w, 'm')) + + + """ d = _inplace_enabled_define_and_cleanup(self) d, weights = _collapse( Collapse.sum_of_weights, @@ -12250,12 +12459,23 @@ def sum_of_weights2( inplace=False, i=False, ): - """Collapse axes with the sum of squares of weights. + """Calculate sums of squares of weights. - Missing data array elements are omitted from the calculation. + Calculates the sum of squares of weights or the sum of squares + of weights values along axes. + + The weights given by the *weights* parameter are internally + broadcast to the shape of the data, and those weights that + are missing data, or that correspond to the missing elements + of the data, are assigned a weight of 0. It is these processed + weights that are squared and summed. - .. seealso:: `maximum`, `mean`, `mid_range`, `minimum`, `range`, - `sample_size`, `sd`, `sum`, `sum_of_weights`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sum`, `sum_of_squares`, + `sum_of_weights2` :Parameters: @@ -12271,20 +12491,39 @@ def sum_of_weights2( .. versionadded:: TODODASK - {{inplace: `bool`, optional}} + {[inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. **Examples** - """ - # from .collapse_functions import cf_sum_of_weights2 + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum_of_weights2() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_weights2(weights=w) + + >>> d.sum_of_weights2(weights=cf.Data(w, 'm')) + + + """ d = _inplace_enabled_define_and_cleanup(self) d, weights = _collapse( Collapse.sum_of_weights2, @@ -12308,7 +12547,9 @@ def sum_of_weights2( return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) def sd( self, axes=None, @@ -12319,104 +12560,25 @@ def sd( split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - r"""Collapse axes by calculating their standard deviation. - - The standard deviation may be adjusted for the number of degrees of - freedom and may be calculated with weighted values. - - Missing data array elements and those with zero weight are omitted - from the calculation. - - The unweighted standard deviation, :math:`s`, of :math:`N` values - :math:`x_i` with mean :math:`m` and with :math:`N-ddof` degrees of - freedom (:math:`ddof\ge0`) is: - - .. math:: s=\sqrt{\\frac{1}{N-ddof} \sum_{i=1}^{N} (x_i - m)^2} - - The weighted standard deviation, :math:`\\tilde{s}_N`, of :math:`N` - values :math:`x_i` with corresponding weights :math:`w_i`, weighted - mean :math:`\\tilde{m}` and with :math:`N` degrees of freedom is: - - .. 
math:: \\tilde{s}_N=\sqrt{\\frac{1}{\sum_{i=1}^{N} w_i} - \sum_{i=1}^{N} w_i(x_i - \\tilde{m})^2} - - The weighted standard deviation, :math:`\\tilde{s}`, of :math:`N` - values :math:`x_i` with corresponding weights :math:`w_i` and with - :math:`N-ddof` degrees of freedom (:math:`ddof>0`) is: - - .. math:: \\tilde{s} = \sqrt{\\frac{f \sum_{i=1}^{N} w_i}{f - \sum_{i=1}^{N} w_i - ddof}} \\tilde{s}_N - - where :math:`f` is the smallest positive number whose product with - each weight is an integer. :math:`f \sum_{i=1}^{N} w_i` is the - size of a new sample created by each :math:`x_i` having - :math:`fw_i` repeats. In practice, :math:`f` may not exist or may - be difficult to calculate, so :math:`f` is either set to a - predetermined value or an approximate value is calculated. The - approximation is the smallest positive number whose products with - the smallest and largest weights and the sum of the weights are - all integers, where a positive number is considered to be an - integer if its decimal part is sufficiently small (no greater than - :math:`10^{-8}` plus :math:`10^{-5}` times its integer part). This - approximation will never overestimate :math:`f`, so - :math:`\\tilde{s}` will never be underestimated when the - approximation is used. If the weights are all integers which are - collectively coprime then setting :math:`f=1` will guarantee that - :math:`\\tilde{s}` is exact. + r"""Calculate standard deviations. - :Parameters: + Calculates the standard deviation of an array or the standard + deviations along axes. - axes : (sequence of) `int`, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - squeeze : `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1. When the collapsed axes are retained, the - result is guaranteed to broadcast correctly against the - original array. + ..seealso:: `mean`, `sum`, `var` - *Parameter example:* - Suppose that an array, ``d``, has shape (2, 3, 4) and - ``e = d.sd(axis=1)``. Then ``e`` has shape (2, 1, 4) - and, for example, ``d/e`` is allowed. If ``e = - d.sd(axis=1, squeeze=True)`` then ``e`` will have shape - (2, 4) and ``d/e`` is an illegal operation. - - weights : data-like or `dict`, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - equal weights of 1. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values it may be used in - conjunction with any value of *axes*, because the axes to - which the weights apply are given explicitly. + :Parameters: - *Parameter example:* - Suppose that the original array being collapsed has - shape (2, 3, 4) and *weights* is set to a data-like - object, ``w``. If ``axes=None`` then ``w`` must have - shape (2, 3, 4). If ``axes=(0, 1, 2)`` then ``w`` must - have shape (2, 3, 4). 
If ``axes=(2, 0, 1)`` then ``w`` - must either have shape (2, 3, 4) or else (4, 2, 3). If - ``axes=1`` then ``w`` must either have shape (2, 3, 4) - or else (3,). If ``axes=(2, 0)`` then ``w`` must either - have shape (2, 3, 4) or else (4, 2). Suppose *weights* - is a dictionary. If ``weights={1: x}`` then ``x`` must - have shape (3,). If ``weights={1: x, (2, 0): y}`` then - ``x`` must have shape (3,) and ``y`` must have shape (4, - 2). The last example is equivalent to ``weights={(1, 2, - 0): x.outerproduct(y)}`` (see `outerproduct` for - details). + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} {{mtol: number, optional} @@ -12433,23 +12595,29 @@ def sd( :Returns: `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - >>> d = cf.Data([1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]) - >>> e = cf.Data([1, 2, 3, 4]) - >>> d.sd(squeeze=False) - - >>> d.sd() - - >>> e.sd(weights=[2, 3, 5, 6]) - - >>> e.sd(weights=[2, 3, 5, 6], f=1) - + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.sd(ddof=0) - - >>> e.sd(ddof=0, weights=[2, 3, 5, 6]) - + + >>> d.sd(ddof=1) + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sd(ddof=1, weights=w) + """ d = _inplace_enabled_define_and_cleanup(self) @@ -12462,7 +12630,8 @@ def sd( split_every=split_every, inplace=True, ) - return d ** 0.5 # TODODASK: replace with sqrt + d.sqrt(inplace=True) + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -12477,12 +12646,26 @@ def var( inplace=False, split_every=None, i=False, - _preserve_partitions=False, ): - """TODO. + """Calculate variances. + + Calculates the variance of an array or the variance values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean`, `sd`, `sum` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} {{ddof: number}} @@ -12491,6 +12674,37 @@ def var( .. versionadded:: TODODASK + {{inplace: `bool`, optional}} + + {{i: deprecated at version 3.0.0}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.var(ddof=0) + + >>> d.var(ddof=1) + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.var(ddof=1, weights=w) + + """ if ddof is None: raise ValueError("Must set the delta degrees of freedom (ddof)") @@ -12570,6 +12784,122 @@ def section( self, axes, data=True, stop=stop, chunks=chunks, min_step=min_step ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def square(self, dtype=None, inplace=False): + """Calculate the non-negative square root. + + .. versionadded:: TODODASK + + .. seealso:: `sqrt`, `sum_of_squares` + + :Parameters: + + dtype: data-type, optional + Overrides the data type of the output arrays. A + matching precision of the calculation should be + chosen. For example, a *dtype* of ``'int32'`` is only + allowed when the input values are integers. 
+ + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The element-wise positive square root of the data + collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> d = cf.Data([[0, 1, 2.5, 3, 4]], 'K', mask=[[0, 0, 0, 1, 0]]) + >>> e = d.square() + >>> e + + >>> print(e.array) + [[0.0 1.0 6.25 -- 16.0]] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d.to_dask_array() + dx = da.square(dx, dtype=dtype) + d._set_dask(dx, reset_mask_hardness=False) + + units = d.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def sqrt(self, dtype=None, inplace=False): + """Calculate the non-negative square root. + + .. versionadded:: TODODASK + + .. seealso:: `square` + + :Parameters: + + dtype: data-type, optional + Overrides the data type of the output arrays. A + matching precision of the calculation should be + chosen. For example, a *dtype* of ``'int32'` is not + allowed, even if the input values are perfect squares. + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The element-wise positive square root of the data + collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> d = cf.Data([[0, 1, 2, 3, 4]], 'K2', mask=[[0, 0, 0, 1, 0]]) + >>> e = d.sqrt() + >>> e + + >>> print(e.array) + [[0.0 1.0 1.4142135623730951 -- 2.0]] + + Negative values raise a warning but result in either NaN or, + if the there are already missing values, missing data: + + >>> import warnings + >>> d = cf.Data([0, 1, -4]) + >>> print(d.array) + [ 0 1 -4] + >>> with warnings.catch_warnings(): + ... warnings.simplefilter("ignore") + ... print(d.sqrt().array) + ... + [ 0. 1. nan] + + >>> d = cf.Data([0, 1, -4], mask=[1, 0, 0]) + >>> print(d.array) + [-- 1 -4] + >>> with warnings.catch_warnings(): + ... warnings.simplefilter("ignore") + ... print(d.sqrt().array) + ... + [-- 1.0 --] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d.to_dask_array() + dx = da.sqrt(dx, dtype=dtype) + d._set_dask(dx, reset_mask_hardness=False) + + units = d.Units + if units: + d.override_units(units ** 0.5, inplace=True) + + return d + # ---------------------------------------------------------------- # Alias # ---------------------------------------------------------------- @@ -12578,6 +12908,50 @@ def dtarray(self): """Alias for `datetime_array`""" return self.datetime_array + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def maximum( + self, + axes=None, + squeeze=False, + mtol=1, + split_every=None, + inplace=False, + i=False, + ): + """Alias for `max`""" + return self.max( + axes=axes, + squeeze=squeeze, + mtol=mtol, + split_every=split_every, + inplace=inplace, + i=i, + ) + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def minimum( + self, + axes=None, + squeeze=False, + mtol=1, + split_every=None, + inplace=False, + i=False, + ): + """Alias for `min`""" + return self.min( + axes=axes, + squeeze=squeeze, + mtol=mtol, + split_every=split_every, + inplace=inplace, + i=i, + ) + def standard_deviation( self, axes=None, @@ -12856,11 +13230,11 @@ def _collapse( ddof=None, split_every=None, ): - """Collapse data using a callable *func*. + """Collapse data using a given funcion. .. versionadded:: TODODASK - .. seealso:: `_format_weights` + .. seealso:: `_parse_weights` :Parameters: @@ -12884,26 +13258,32 @@ def _collapse( data. 
    def standard_deviation(
        self,
        axes=None,
@@ -12856,11 +13230,11 @@ def _collapse(
        ddof=None,
        split_every=None,
    ):
-        """Collapse data using a callable *func*.
+        """Collapse data using a given function.

        .. versionadded:: TODODASK

-        .. seealso:: `_format_weights`
+        .. seealso:: `_parse_weights`

        :Parameters:

@@ -12884,26 +13258,32 @@ def _collapse(
                data.

            weights: data_like, `dict`, or `None`, optional
-                Weights associated with values of the array. By default
-                all non-missing elements of the array are assumed to have
-                a weight equal to one.
+
+                Weights associated with values of the data. By default
+                *weights* is `None`, meaning that all non-missing
+                elements of the data have a weight of 1 and all
+                missing elements have a weight of 0.

                If *weights* is a data_like object then it must be
-                broadcastable to the array or, if that is not the case,
-                the same shape as the axes being collapsed. TODODASK -
-                scrib the last possibility?
-
-                If *weights* is a dictionary then each key specifies axes
-                of the array (an `int` or `tuple` of `int`), with a
-                corresponding value of data_like weights for those
-                axes. The dimensions of a weights value must correspond to
-                its key axes in the same order. The weights that will be
-                used in the collapse will an outer product of the
+                broadcastable to the array.
+
+                If *weights* is a dictionary then each key specifies
+                axes of the data (an `int` or `tuple` of `int`), with
+                a corresponding value of data_like weights for those
+                axes. The dimensions of a weights value must
+                correspond to its key axes in the same order. Not all
+                of the axes need weights assigned to them. The weights
+                that will be used will be an outer product of the
                dictionary's values.

-                For collapses that do not need weights (such as a
-                maximum), *weights* must be `None` and *func* need not
-                support a ``weights`` parameter.
+                However they are specified, the weights are internally
+                broadcast to the shape of the data, and those weights
+                that are missing data, or that correspond to the
+                missing elements of the data, are assigned a weight of
+                0.
+
+                For collapse functions that do not have a ``weights``
+                parameter, *weights* must be `None`.

            keepdims: `bool`, optional
                By default, the axes which are collapsed are left in the
@@ -12935,7 +13315,7 @@ def _collapse(
    :Returns:

        `Data`, formatted weights
-            The collapsed data and the output of ``_format_weights(d,
+            The collapsed data and the output of ``_parse_weights(d,
            weights, axis)``.

    """
@@ -12946,9 +13326,8 @@ def _collapse(
        "mtol": mtol,
    }

-    weights = _format_weights(d, weights, axis)
    if weights is not None:
-        kwargs["weights"] = weights
+        kwargs["weights"] = _parse_weights(d, weights, axis)

    if ddof is not None:
        kwargs["ddof"] = ddof
@@ -12960,39 +13339,66 @@ def _collapse(
    return d, weights


-def _format_weights(d, weights, axis=None):
-    """TODODASK.
+def _parse_weights(d, weights, axis=None):
+    """Parse the weights input to `_collapse`.

-    :Returns:
+    :Parameters:
+
+        d: `Data`
+            The data to be collapsed.
+
+        weights: data_like or `dict`
+            See `_collapse` for details.

+        axis: (sequence of) `int`, optional
+            See `_collapse` for details.
+
+    :Returns:

+        `Data` or `None`
+            * If *weights* is a data_like object then they are
+              returned unchanged as a `Data` object. It is up to the
+              downstream functions to check the weights
+              broadcastability.
+
+            * If *weights* is a dictionary then the outer product of
+              the dictionary's values (i.e. of the weights
+              components) is broadcast to the data and returned as a
+              `Data` object.
+
+              If the dictionary is empty, or none of the axes defined
+              by the keys correspond to collapse axes defined by
+              *axis*, then the collapse is unweighted and `None`
+              is returned.
+
+            Note that, in all cases, the returned weights are *not*
+            modified to account for missing values in the data.
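The outer-product construction described above is easy to mimic in plain
NumPy, which may help when reading the examples that follow; this sketch
is independent of the cf internals and assumes data with shape (4, 3):

    import numpy as np

    data = np.arange(12).reshape(4, 3)
    components = {0: np.array([1, 2, 3, 4]), 1: np.array([1, 2, 1])}

    # Outer product of the per-axis weight components, giving an array
    # that is broadcastable to (here, the same shape as) the data
    w = np.multiply.outer(components[0], components[1])
    assert w.shape == data.shape
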
**Examples** >>> d = cf.Data(np.arange(12)).reshape(4, 3) - >>> print(cf.data.data._format_weights(d, None)) - None - - >>> cf.data.data._format_weights(d, [1, 2, 1], (0, 1)) + >>> _parse_weights(d, [1, 2, 1], (0, 1)) - >>> cf.data.data._format_weights(d, [[1, 2, 1]], (0, 1)) + >>> _parse_weights(d, [[1, 2, 1]], (0, 1)) - >>> cf.data.data._format_weights(d, {1: [1, 2, 1]}, (0, 1)) + >>> _parse_weights(d, {1: [1, 2, 1]}, (0, 1)) - >>> cf.data.data._format_weights( - ... d, {0: [1, 2, 3, 4], 1: [1, 2, 1}, (0, 1) - ... ) - TODODASK (need __mul__ to be daskified) + >>> print(_parse_weights(d, {0: [1, 2, 3, 4], 1: [1, 2, 1]}, (0, 1))) + [[1 2 1] + [2 4 2] + [3 6 3] + [4 8 4]] - """ - if weights is None: - # No weights - return + >>> print(cf.data.data._parse_weights(d, {}, (0, 1))) + None + + >>> print(cf.data.data._parse_weights(d, {1: [1, 2, 1]}, 0)) + None + """ if not isinstance(weights, dict): # Weights is data_like. Don't check broadcastability to d, # leave that to whatever uses the weights. diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 5cb11f12b3..4624afba6e 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -270,23 +270,28 @@ from the data.""", # weights "{{weights: data_like, `dict`, or `None`, optional}}": """weights: data_like, `dict`, or `None`, optional - Weights associated with values of the array. By - default *weights* is `None`, meaning that all - non-missing elements of the array are assumed to have - a weight equal to one. + Weights associated with values of the data. By default + *weights* is `None`, meaning that all non-missing + elements of the data have a weight of 1 and all + missing elements have a weight of 0. If *weights* is a data_like object then it must be - broadcastable to the array or, if that is not the - case, the same shape as the axes being - collapsed. TODODASK - scrib the last possibility? + broadcastable to the array. If *weights* is a dictionary then each key specifies - axes of the array (an `int` or `tuple` of `int`), with + axes of the data (an `int` or `tuple` of `int`), with a corresponding value of data_like weights for those axes. The dimensions of a weights value must - correspond to its key axes in the same order. The - weights that will be used in the collapse will an - outer product of the dictionary's values.""", + correspond to its key axes in the same order. Not all + of the axes need weights assigned to them. The weights + that will be used will be an outer product of the + dictionary's values. + + However they are specified, the weights are internally + broadcast to the shape of the data, and those weights + that are missing data, or that correspond to the + missing elements of the data, are assigned a weight of + 0.""", # collapse mtol "{{mtol: number, optional}}": """mtol: number, optional The sample size threshold below which collapsed values @@ -307,10 +312,12 @@ of missing data in the input data.""", # ddof "{{ddof: number}}": """ddof: number - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of non-missing elements. A value - of 1 applies Bessel's correction.""", + The delta degrees of freedom, a non-negative + number. The number of degrees of freedom used in the + calculation is (N-*ddof*) where N represents the + number of non-missing elements. A value of 1 applies + Bessel's correction. 
If the calculation is weighted + then *ddof* can only be 0 or 1.""", # split_every "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional Determines the depth of the recursive aggregation. If diff --git a/cf/functions.py b/cf/functions.py index 23f146ee18..f8bd234ef1 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -43,7 +43,7 @@ from numpy.ma import take as _numpy_ma_take from psutil import Process, virtual_memory -from . import __file__, __version__, mpi_size +from . import __file__, __version__ from .constants import ( CONSTANTS, OperandBoundsCombination, @@ -810,7 +810,7 @@ class chunksize(ConstantAccess): The upper limit to the chunksize is given by: - .. math:: upper\_chunksize = \dfrac{f \cdot total\_memory}{mpi\_size + .. math:: upper\_chunksize = \dfrac{f \cdot total\_memory}{1 \cdot w_1 + w_2} where :math:`f` is the *free memory factor* and :math:`w_1` and @@ -851,16 +851,11 @@ def _parse(cls, arg): """ upper_chunksize = (free_memory_factor() * min_total_memory()) / ( - (mpi_size * _WORKSPACE_FACTOR_1()) + _WORKSPACE_FACTOR_2() + (_WORKSPACE_FACTOR_1()) + _WORKSPACE_FACTOR_2() ) arg = float(arg) - if arg > upper_chunksize and mpi_size > 1: - raise ValueError( - f"Specified chunk size ({arg}) is too large for the given " - f"free memory factor ({upper_chunksize})" - ) - elif arg <= 0: + if arg <= 0: raise ValueError(f"Chunk size ({arg}) must be positive") return arg diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 35f88762de..35c725e209 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -60,6 +60,7 @@ def reshape_array(a, axes): b = b.reshape(new_shape) return b + def axis_combinations(a): return [ axes @@ -67,14 +68,15 @@ def axis_combinations(a): for axes in itertools.combinations(range(a.ndim), n) ] + class DataTest(unittest.TestCase): axes_combinations = axis_combinations(a) - #[ + # [ # axes # for n in range(1, a.ndim + 1) # for axes in itertools.combinations(range(a.ndim), n) - #] + # ] filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_file.nc" @@ -2647,641 +2649,6 @@ def test_Data_percentile_median(self): with self.assertRaises(ValueError): d.percentile(q).array - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") - def test_Data_mean_of_upper_decile(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - p = np.percentile(b, 90, axis=-1, keepdims=True) - b = np.ma.where(b < p, np.ma.masked, b) - b = np.average(b, axis=-1) - - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, unweighted, " - "unmasked \ne={}, \nb={}".format(axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = np.ma.filled(b, np.nan) - with np.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, message=".*All-NaN slice encountered" - ) - p = np.nanpercentile(b, 90, axis=-1, keepdims=True) - - b = np.ma.masked_where(np.isnan(b), b, copy=False) - - p = np.where(np.isnan(p), b.max() + 1, p) - - with np.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, - message=".*invalid value encountered in less", - ) - b = np.ma.where(b < p, np.ma.masked, b) - - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) - - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "mean_of_upper_decile, axis={}, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, " - "unweighted, masked " - "\ne={}, \nb={}".format(axes, e.array, b), - ) - - def test_Data_range_mid_range(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - msg = None - - # unweighted, unmasked - d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - e = getattr(d, h)(axes=axes, squeeze=True) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, unmasked " - # f"\ne={e.array}, \nb={b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # unweighted, masked - d = cf.Data(self.ma, chunks=(2, 3, 2, 5)) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - b = np.ma.asanyarray(b) - - e = getattr(d, h)(axes=axes, squeeze=True) - - # For debugging - # msg = (f"{h}, axis={axes}, \ne.mask={e.mask.array}, " - # f "\nb.mask={b.mask}") - - self.assertTrue((e.mask.array == b.mask).all(), msg) - - # For debugging - # msg = (f"{h}, axis={axes}, unweighted, masked " - # f"\ne={e.array}, \nb={b}") - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - def test_Data_integral(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - msg = None - - # unmasked - d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) - x = cf.Data(self.w, "kg") - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - - e = d.integral(axes=axes, 
squeeze=True, weights=x) - self.assertTrue(e.Units, cf.Units("m kg")) - - # For debugging - # msg = f"axis={axes}, masked \ne={e.array}, \nb={b}" - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - # masked - d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - b = np.ma.asanyarray(b) - - e = d.integral(axes=axes, squeeze=True, weights=x) - self.assertTrue(e.Units, cf.Units("m kg")) - - # For debugging - # msg = f"axis={axes}, masked \ne={e.mask.array}, \nb={b}" - - self.assertTrue((e.mask.array == b.mask).all(), msg) - - # For debugging - # msg = f"axis={axes}, masked \ne={e.array}, \nb={b}" - - self.assertTrue( - np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_sum_of_weights_sum_of_weights2(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("sum_of_weights", "sum_of_weights2"): - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for a, h in zip( - (self.mones, self.mones), ("sum_of_weights", "sum_of_weights2") - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = np.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne.mask={}, \nb.mask={}".format( - h, axes, pp, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma) - x = cf.Data(self.w) - for a, h in zip( - (self.mw, self.mw * self.mw), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - a = a.copy() - a.mask = self.ma.mask - b = reshape_array(a, axes) - b = np.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format(h, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a) - for a, h in zip( - (self.w, self.w * self.w), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - -# def test_Data_sum_mean_mean_absolute_value(self): -# if self.test_only and inspect.stack()[0][3] not in self.test_only: -# return -# -# msg = None -# -# for absolute in (False, True): -# a = 
self.a -# ma = self.ma -# method = "mean" -# if absolute: -# a = np.absolute(a) -# ma = np.absolute(ma) -# method = "mean_absolute_value" -# -# # unweighted, unmasked -# d = cf.Data(self.a, "m", chunks=(2, 3, 2, 5)) -# for axes in self.axes_combinations: -# b = reshape_array(a, axes) -# b = np.mean(b, axis=-1) -# e = getattr(d, method)(axes=axes, squeeze=True) -# -# # For debugging -# # msg = (f"{method} unweighted, unmasked, axis={axes}, " -# # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") -# -# self.assertTrue( -# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg -# ) -# -# # weighted, unmasked -# x = cf.Data(self.w) -# for axes in self.axes_combinations: -# b = reshape_array(a, axes) -# v = reshape_array(self.w, axes) -# b = np.average(b, axis=-1, weights=v) -# -# e = getattr(d, method)(axes=axes, weights=x, squeeze=True) -# -# # For debugging -# # msg = (f"{method} weighted, unmasked, axis={axes}, " -# # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") -# -# self.assertTrue( -# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg -# ) -# -# # unweighted, masked -# d = cf.Data(self.ma, "m", chunks=(2, 3, 2, 5)) -# for axes in self.axes_combinations: -# b = reshape_array(ma, axes) -# b = np.ma.average(b, axis=-1) -# b = np.ma.asanyarray(b) -# -# e = getattr(d, method)(axes=axes, squeeze=True) -# -# # For debugging -# # msg = (f"{method} unweighted, masked, axis={axes}, " -# # f"\ne.mask={e.mask.array}, \nb={b}") -# -# self.assertTrue((e.mask.array == b.mask).all(), msg) -# -# # For debugging -# # msg = (f"{method} unweighted, masked, axis={axes}, " -# # f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}") -# -# self.assertTrue( -# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg -# ) -# -# # weighted, masked -# for axes in self.axes_combinations: -# print(axes) -# b = reshape_array(ma, axes) -# v = reshape_array(self.mw, axes) -# b = np.ma.average(b, axis=-1, weights=v) -# b = np.ma.asanyarray(b) -# -# e = getattr(d, method)(axes=axes, weights=x, squeeze=True) -# -# # For debugging -# # msg = (f"{method} weighted, masked, axis={axes}, " -# # f"\ne.mask={e.mask.array}, \nb={b}") -# -# self.assertTrue((e.mask.array == b.mask).all(), msg) -# -# # For debugging -# msg = ( -# f"{method} weighted, masked, axis={axes}, " -# f"\ne={e.array}, \nb={b}, \ndiff={e.array-b}" -# ) -# -# self.assertTrue( -# np.allclose(e.array, b, rtol=1e-05, atol=1e-08), msg -# ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_root_mean_square(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - b = np.mean(b, axis=-1) ** 0.5 - e = d.root_mean_square(axes=axes, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for - - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - v = reshape_array(self.w, axes) - b = np.average(b, axis=-1, weights=v) ** 0.5 - - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - b = np.ma.average(b, axis=-1) - b = 
np.ma.asanyarray(b) ** 0.5 - - e = d.root_mean_square(axes=axes, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, unweighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, masked \ne={}, " - "\nb={}, ".format(axes, e.array, b), - ) - # --- End: for - - # weighted, masked - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - v = reshape_array(self.mw, axes) - b = np.ma.average(b, axis=-1, weights=v) - b = np.ma.asanyarray(b) ** 0.5 - - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, weighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, masked \ne={}, \nb={}, ".format( - axes, e.array, b - ), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_sample_size(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - # --- End: for - - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.mones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'axes_combinations'") - def test_Data_sd_var(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - ddofs = (0, 1) - - for pp in (False, True): - # unweighted, unmasked - d = cf.Data(self.a, units="K") - for _np, h in zip((np.var, np.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - b = _np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma, units="K") - for _np, h in zip((np.ma.var, np.ma.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = _np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): - for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - - avg = np.average(b, axis=-1, weights=v) - if np.ndim(avg) < b.ndim: - avg = np.expand_dims(avg, -1) - - b, V1 = np.average( - (b - avg) ** 2, axis=-1, weights=v, returned=True - ) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. 
The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. - V2 = np.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - b = np.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, unmasked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): - for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.ma, axes) - v = reshape_array(self.mw, axes) - - not_enough_data = np.ma.count(b, axis=-1) <= ddof - - avg = np.ma.average(b, axis=-1, weights=v) - if np.ndim(avg) < b.ndim: - avg = np.expand_dims(avg, -1) - - b, V1 = np.ma.average( - (b - avg) ** 2, axis=-1, weights=v, returned=True - ) - - b = np.ma.where(not_enough_data, np.ma.masked, b) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. - V2 = np.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - if h == "sd": - self.assertEqual(e.Units, d.Units) - else: - self.assertEqual(e.Units, d.Units ** 2) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}, ".format( - h, axes, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, masked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'select'") def test_Data_dumpd_loadd_dumps(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -3772,12 +3139,131 @@ def test_Data_change_calendar(self): # calendar). 
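        # Editor's illustration (standalone, not one of the tests): the
        # reliability-weights correction used in the weighted sd/var
        # reference calculations above can be checked in isolation.
        # With all weights equal, the corrected estimate must reduce to
        # the ordinary Bessel-corrected (ddof=1) variance:
        #
        #     import numpy as np
        #
        #     x = np.array([1.0, 2.0, 4.0, 7.0])
        #     w = np.ones_like(x)
        #
        #     mu, V1 = np.average(x, weights=w, returned=True)
        #     var = np.sum(w * (x - mu) ** 2) / V1   # biased estimate
        #     V2 = np.sum(w * w)
        #     var *= V1 * V1 / (V1 * V1 - V2)        # reliability ddof=1
        #
        #     assert np.isclose(var, np.var(x, ddof=1))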
with self.assertRaises(ValueError): e = d.change_calendar("noleap").array - + + def test_Data_reshape(self): + a = self.ma + d = cf.Data(a) + + self.assertIsNone(d.reshape(*d.shape, inplace=True)) + self.assertEqual(d.shape, a.shape) + + for original_shape, new_shape, chunks in ( + ((10,), (10,), (3, 3, 4)), + ((10,), (10, 1, 1), 5), + ((10,), (1, 10), 5), + ((24,), (2, 3, 4), 12), + ((1, 24), (2, 3, 4), 12), + ((2, 3, 4), (24,), (1, 3, 4)), + ((2, 3, 4), (24,), 4), + ((2, 3, 4), (24, 1), 4), + ((2, 3, 4), (1, 24), 4), + ((4, 4, 1), (4, 4), 2), + ((4, 4), (4, 4, 1), 2), + ((1, 4, 4), (4, 4), 2), + ((1, 4, 4), (4, 4, 1), 2), + ((1, 4, 4), (1, 1, 4, 4), 2), + ((4, 4), (1, 4, 4, 1), 2), + ((4, 4), (1, 4, 4), 2), + ((2, 3), (2, 3), (1, 2)), + ((2, 3), (3, 2), 3), + ((4, 2, 3), (4, 6), 4), + ((3, 4, 5, 6), (3, 4, 5, 6), (2, 3, 4, 5)), + ((), (1,), 1), + ((1,), (), 1), + ((24,), (3, 8), 24), + ((24,), (4, 6), 6), + ((24,), (4, 3, 2), 6), + ((24,), (4, 6, 1), 6), + ((24,), (4, 6), (6, 12, 6)), + ((64, 4), (8, 8, 4), (16, 2)), + ((4, 64), (4, 8, 4, 2), (2, 16)), + ((4, 8, 4, 2), (2, 1, 2, 32, 2), (2, 4, 2, 2)), + ((4, 1, 4), (4, 4), (2, 1, 2)), + ((0, 10), (0, 5, 2), (5, 5)), + ((5, 0, 2), (0, 10), (5, 2, 2)), + ((0,), (2, 0, 2), (4,)), + ((2, 0, 2), (0,), (4, 4, 4)), + ((2, 3, 4), -1, -1), + ): + a = np.random.randint(10, size=original_shape) + d = cf.Data(a, chunks=chunks) + + a = a.reshape(new_shape) + d = d.reshape(new_shape) + + self.assertEqual(d.shape, a.shape) + self.assertTrue((d.array == a).all()) + + def test_Data_square(self): + a = self.ma.astype(float) + asquare = np.square(a) + + d = cf.Data(a) + self.assertIsNone(d.square(inplace=True)) + self.assertTrue((d.array == asquare).all()) + self.assertEqual(d.Units, cf.Units()) + + d = cf.Data(a, "m") + e = d.square() + self.assertEqual(e.dtype, asquare.dtype) + self.assertTrue((e.array == asquare).all()) + self.assertEqual(e.Units, cf.Units("m2")) + + asquare = np.square(a, dtype="float32") + e = d.square(dtype="float32") + self.assertEqual(e.dtype, asquare.dtype) + self.assertTrue((e.array == asquare).all()) + + def test_Data_sqrt(self): + a = self.ma.astype(float) + asqrt = np.sqrt(a) + + d = cf.Data(a) + self.assertIsNone(d.sqrt(inplace=True)) + self.assertTrue((d.array == asqrt).all()) + self.assertEqual(d.Units, cf.Units()) + + d = cf.Data(a, "m2") + e = d.sqrt() + self.assertEqual(e.dtype, asqrt.dtype) + self.assertTrue((e.array == asqrt).all()) + self.assertEqual(e.Units, cf.Units("m")) + + asqrt = np.sqrt(a, dtype="float32") + e = d.sqrt(dtype="float32") + self.assertEqual(e.dtype, asqrt.dtype) + self.assertTrue((e.array == asqrt).all()) + + def test_Data_integral(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.integral(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Check units + e = d.integral(weights=weights) + self.assertEqual(e.Units, cf.Units("K")) + + e = d.integral(weights=cf.Data(weights, "m")) + self.assertEqual(e.Units, cf.Units("K m")) + def test_Data_max(self): # Masked array a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.max(b, axis=-1) @@ -3785,15 
+3271,18 @@ def test_Data_max(self): e = d.max(axes=axis, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_maximum_absolute_value(self): # Masked array a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.max(abs(b), axis=-1) @@ -3801,16 +3290,19 @@ def test_Data_maximum_absolute_value(self): e = d.maximum_absolute_value(axes=axis, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_mean(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) @@ -3819,16 +3311,19 @@ def test_Data_mean(self): e = d.mean(axes=axis, weights=weights, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_mean_absolute_value(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) @@ -3837,15 +3332,18 @@ def test_Data_mean_absolute_value(self): e = d.mean_absolute_value(axes=axis, weights=weights, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_mid_range(self): # Masked array, non-masked weights a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = (np.max(b, axis=-1) + np.min(b, axis=-1)) / 2.0 @@ -3853,15 +3351,18 @@ def test_Data_mid_range(self): e = d.mid_range(axes=axis, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_min(self): # Masked array a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.min(b, axis=-1) @@ -3869,15 +3370,18 @@ def test_Data_min(self): e = d.min(axes=axis, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_minimum_absolute_value(self): # Masked array a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.min(abs(b), axis=-1) @@ -3885,15 +3389,18 @@ def test_Data_minimum_absolute_value(self): e = d.minimum_absolute_value(axes=axis, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + 
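    # Editor's note (illustrative only): these collapse tests all follow
    # the same pattern -- `reshape_array` moves the collapsed axes to the
    # end and flattens them, so a NumPy reduction along the last axis
    # gives the reference answer for a multi-axis collapse:
    #
    #     import numpy as np
    #
    #     a = np.arange(24.0).reshape(2, 3, 4)
    #     axes = (0, 2)  # axes to collapse
    #     kept = [i for i in range(a.ndim) if i not in axes]
    #     b = np.transpose(a, kept + list(axes)).reshape(a.shape[1], -1)
    #
    #     assert np.allclose(b.max(axis=-1), np.max(a, axis=axes))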
def test_Data_range(self): # Masked array a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.max(b, axis=-1) - np.min(b, axis=-1) @@ -3901,16 +3408,19 @@ def test_Data_range(self): e = d.range(axes=axis, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.range().Units, cf.Units("K")) + def test_Data_root_mean_square(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) @@ -3918,47 +3428,67 @@ def test_Data_root_mean_square(self): b = np.ma.asanyarray(b) e = d.root_mean_square(axes=axis, weights=weights, squeeze=True) - e = np.ma.array(e.array) + e = np.ma.array(e.array) self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.root_mean_square().Units, cf.Units("K")) + def test_Data_sample_size(self): # Masked array a = self.ma - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.ma.asanyarray(b) - + e = d.sample_size(axes=axis, squeeze=True) - e = np.ma.array(e.array) - + e = np.ma.array(e.array) + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) # Non-masked array a = self.a - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.asanyarray(b) - + e = d.sample_size(axes=axis, squeeze=True) - e = np.array(e.array) - + e = np.array(e.array) + self.assertTrue(np.allclose(e, b)) - + + # Check units + self.assertEqual(d.sample_size().Units, cf.Units()) + + def test_Data_sd(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + sd = d.sd(weights=weights, ddof=1) + var = d.var(weights=weights, ddof=1) + + # Check units + self.assertEqual(sd.Units, cf.Units("K")) + + self.assertTrue(sd.equals(var.sqrt())) + def test_Data_sum(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) @@ -3966,81 +3496,110 @@ def test_Data_sum(self): b = np.ma.asanyarray(b) e = d.sum(axes=axis, weights=weights, squeeze=True) - e = np.ma.array(e.array) + e = np.ma.array(e.array) self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) + # Check units + self.assertEqual(d.sum().Units, cf.Units("K")) + def test_Data_sum_of_squares(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * b * w, axis=-1) b = np.ma.asanyarray(b) - + e = d.sum_of_squares(axes=axis, weights=weights, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) + # Check units + 
self.assertEqual(d.sum_of_squares().Units, cf.Units("K2")) + def test_Data_sum_of_weights(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weights=None + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) w = np.ma.masked_where(b.mask, w) b = np.sum(w, axis=-1) b = np.ma.asanyarray(b) - + e = d.sum_of_weights(axes=axis, weights=weights, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) + # Check units + self.assertEqual(d.sum_of_weights().Units, cf.Units()) + w = cf.Data(weights, "m") + self.assertEqual(d.sum_of_weights(weights=w).Units, cf.Units("m")) + def test_Data_sum_of_weights2(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weights=None + for axis in axis_combinations(a): + e = d.sum_of_weights2(axes=axis) + f = d.sum_of_weights(axes=axis) + self.assertTrue(e.equals(f)) + for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) w = np.ma.masked_where(b.mask, w) b = np.sum(w * w, axis=-1) b = np.ma.asanyarray(b) - + e = d.sum_of_weights2(axes=axis, weights=weights, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) + # Check units e = d.sum_of_weights2(weights=weights) self.assertEqual(e.Units, cf.Units()) - - e = d.sum_of_weights2(weights=cf.Data(weights, 'km')) - self.assertEqual(e.Units, cf.Units('km2')) - - + + e = d.sum_of_weights2(weights=cf.Data(weights, "m")) + self.assertEqual(e.Units, cf.Units("m2")) + def test_Data_var(self): # Masked array, non-masked weights a = self.ma weights = self.w - d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) - # ddof = 0 + # Weighted ddof = 0 for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) @@ -4048,18 +3607,17 @@ def test_Data_var(self): mu = mu.reshape(mu.shape + (1,)) w = np.ma.masked_where(b.mask, w) - b = np.sum(w * (b - mu)**2, axis=-1) + b = np.sum(w * (b - mu) ** 2, axis=-1) b = b / V1 b = np.ma.asanyarray(b) - - e = d.var(axes=axis, weights=weights, ddof=0, - squeeze=True) + + e = d.var(axes=axis, weights=weights, ddof=0, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) - self.assertTrue(np.allclose(e, b), e-b) + self.assertTrue(np.allclose(e, b), f"e={e}\nb={b}\ne-b={e-b}") - # ddof = 1 + # Weighted ddof = 1 for axis in axis_combinations(a): b = reshape_array(a, axis) w = reshape_array(weights, axis) @@ -4068,37 +3626,77 @@ def test_Data_var(self): w = np.ma.masked_where(b.mask, w) V2 = np.sum(w * w, axis=-1) - b = np.sum(w * (b - mu)**2, axis=-1) - b = b / (V1 - (V2/V1)) + b = np.sum(w * (b - mu) ** 2, axis=-1) + b = b / (V1 - (V2 / V1)) b = np.ma.asanyarray(b) - - e = d.var(axes=axis, weights=weights, ddof=1, - squeeze=True) + + e = d.var(axes=axis, weights=weights, ddof=1, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, 
b)) - - # Unweighted ddof = 1 + + # Unweighted ddof = 1 for axis in axis_combinations(a): b = reshape_array(a, axis) mu, V1 = np.ma.average(b, axis=-1, returned=True) mu = mu.reshape(mu.shape + (1,)) - w = np.ma.masked_where(b.mask, w) - b = np.sum(w * (b - mu)**2, axis=-1) + b = np.sum((b - mu) ** 2, axis=-1) b = b / (V1 - 1) b = np.ma.asanyarray(b) - + e = d.var(axes=axis, ddof=1, squeeze=True) e = np.ma.array(e.array) - + self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - e = d.var(ddof=0) - self.assertEqual(e.Units, cf.Units("m2")) - + # Check units + self.assertEqual(d.var(ddof=0).Units, cf.Units("K2")) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __lt__ and __le__") + def test_Data_mean_of_upper_decile(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.filled(b, np.nan) + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, message=".*All-NaN slice encountered" + ) + p = np.nanpercentile(b, 90, axis=-1, keepdims=True) + + b = np.ma.masked_where(np.isnan(b), b, copy=False) + p = np.where(np.isnan(p), b.max() + 1, p) + + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, + message=".*invalid value encountered in less", + ) + b = np.ma.where(b < p, np.ma.masked, b) + + b = np.ma.average(b, axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean_of_upper_decile( + axes=axis, weights=weights, squeeze=True + ) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Check units + self.assertEqual(d.mean_of_upper_decile().Units, cf.Units("K2")) + + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) cf.environment() diff --git a/requirements.txt b/requirements.txt index f1964f302d..e254ad5396 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy>=1.22 cfdm>=1.9.0.1, <1.9.1.0 psutil>=0.6.0 cfunits>=3.3.4 +dask>=2022.2.1 From 30540aae6686291ca64e6dfc6d0fe6ce5988db4a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 16 Mar 2022 19:17:57 +0000 Subject: [PATCH 10/37] dev --- cf/data/collapse_functions.py | 76 +--------- cf/data/data.py | 259 +++++++++------------------------- cf/data/utils.py | 4 +- cf/functions.py | 6 +- cf/test/test_Data.py | 111 ++++++++++++--- 5 files changed, 168 insertions(+), 288 deletions(-) diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index 835ceb9864..a9a9af6b7c 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -549,61 +549,6 @@ def sum( weights=weights, ) - # @staticmethod - # def sum_of_squares( - # a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None - # ): - # """Return sum of square values of an array. - # - # Calculates the sum of square value of an array or the sum of - # square values along axes. - # - # See - # https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods - # for mathematical definitions. - # - # .. versionadded:: TODODASK - # - # :Parameters: - # - # a: `dask.array.Array` - # The array to be collapsed. 
- # - # {{Collapse weights: data_like or `None`, optional}} - # - # {{collapse axes: (sequence of) `int`, optional}} - # - # {{collapse keepdims: `bool`, optional}} - # - # {{mtol: number, optional} - # - # {{split_every: `int` or `dict`, optional}} - # - # :Returns: - # - # `dask.array.Array` - # The collapsed array. - # - # """ - # if weights is None: - # dtype = double_precision_dtype(a) - # else: - # dtype = "f8" - # - # return reduction( - # a, - # partial(cf_sum_chunk, squared=True), - # partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), - # axis=axis, - # keepdims=keepdims, - # dtype=dtype, - # split_every=split_every, - # combine=cf_sum_combine, - # concatenate=False, - # meta=np.array((), dtype=dtype), - # weights=weights, - # ) - @staticmethod def sum_of_weights( a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None @@ -1725,9 +1670,7 @@ def cf_sample_size_agg( # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- -def cf_sum_chunk( - x, weights=None, dtype="f8", computing_meta=False, squared=False, **kwargs -): +def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the sum. This function is passed to `dask.array.reduction` as callable @@ -1737,9 +1680,6 @@ def cf_sum_chunk( :Parameters: - squared: `bool`, optional - If True then calculate the weighted sum of the squares. - See `dask.array.reductions` for details. :Returns: @@ -1747,16 +1687,12 @@ def cf_sum_chunk( `dict` Dictionary with the keys: * N: The sample size. - * sum: The weighted sum of ``x``, or the weighted sum of - ``x**2`` if *squared* is True. + * sum: The weighted sum of ``x`` """ if computing_meta: return x - if squared: - x = np.multiply(x, x, dtype=dtype) - if weights is not None: x = np.multiply(x, weights, dtype=dtype) @@ -2026,12 +1962,12 @@ def cf_var_agg( This function is passed to `dask.array.reduction` as callable *aggregate* parameter. - .. note:: If weights are provided then they are interpreted as - reliability weights, as opposed to frequency weights. + .. note:: Weights are interpreted as reliability weights, as + opposed to frequency weights. See https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights - for details + for details. .. versionadded:: TODODASK @@ -2083,7 +2019,7 @@ def cf_var_agg( f"Got: {ddof!r}" ) - # Now get the required global variance + # Now get the required global variance with the requested ddof var = f * var var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) diff --git a/cf/data/data.py b/cf/data/data.py index d6d405949a..151333f1fc 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -17,7 +17,6 @@ from dask.base import is_dask_collection, tokenize from dask.core import flatten from dask.highlevelgraph import HighLevelGraph -from numpy.testing import suppress_warnings as numpy_testing_suppress_warnings from ..cfdatetime import dt as cf_dt from ..constants import masked as cf_masked @@ -1476,48 +1475,6 @@ def _del_dask(self, default=ValueError(), delete_source=True): return out - def _map_blocks(self, func, **kwargs): - """Apply a function to the data in-place. - - .. warning:: **This method **does not reset the mask - hardness**. It may be necessary for a call to - `_map_blocks` to be followed by a call to - `_reset_mask_hardness` (or equivalent). - - .. versionadded:: TODODASK - - .. 
seealso:: `_reset_mask_hardness` - - :Parameters: - - func: - The function to be applied to the data, via - `dask.array.map_blocks`, to each chunk of the dask - array. - - kwargs: optional - Keyword arguments passed to the - `dask.array.map_blocks` method. - - :Returns: - - `dask.array.Array` - The updated dask array. - - **Examples:** - - >>> d = cf.Data([1, 2, 3]) - >>> dx = d._map_blocks(lambda x: x / 2) - >>> print(d.array) - [0.5 1. 1.5] - - """ - dx = self._get_dask() - dx = dx.map_blocks(func, **kwargs) - self._set_dask(dx, reset_mask_hardness=False) - - return dx - def _reset_mask_hardness(self): """Re-apply the mask hardness to the dask array. @@ -2026,11 +1983,17 @@ def mean_of_upper_decile( {{weights: data_like, `dict`, or `None`, optional}} + TODODASK - note that weights only applies to the + calculation of the mean, not the upper + decile. + {{collapse squeeze: `bool`, optional}} {{mtol: number, optional} - TODODASK - note that mtol onlty applies calculation of - uppder decile, not the mean! + + TODODASK - note that mtol only applies to the + calculation of the upper decile, not the + mean. include_decile: `bool`, optional TODODASK @@ -2053,9 +2016,10 @@ def mean_of_upper_decile( """ - # TODODASK: Some updates off the back of collapse done, but - # still needs looking at. Unit test has also been - # written, but not run. Needs __lt__ and __le__. + # TODODASK: Some updates off the back of daskifying collapse + # have been done, but still needs looking at. A unit + # test has also been written, but not run. Needs + # __lt__ and __le__. d = _inplace_enabled_define_and_cleanup(self) @@ -2067,7 +2031,7 @@ def mean_of_upper_decile( inplace=False, ) - with numpy_testing_suppress_warnings() as sup: + with np.testing.suppress_warnings() as sup: sup.filter( RuntimeWarning, message=".*invalid value encountered in less.*" ) @@ -3382,7 +3346,9 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - d._map_blocks(cf_rt2dt, units=units, dtype=object) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -3437,7 +3403,9 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - d._map_blocks(cf_dt2rt, units=units, dtype=float) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -5209,7 +5177,9 @@ def cf_Units(x): x=x, from_units=old_units, to_units=value, inplace=False ) - self._map_blocks(cf_Units, dtype=dtype) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_Units, dtype=dtype) + self._set_dask(dx, reset_mask_hardness=False) self._Units = value @@ -9060,7 +9030,9 @@ def harden_mask(self): [1 -- 3] """ - self._map_blocks(cf_harden_mask, dtype=self.dtype) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) + self._set_dask(dx, reset_mask_hardness=False) self._hardmask = True def soften_mask(self): @@ -9091,7 +9063,9 @@ def soften_mask(self): [ 1 999 3] """ - self._map_blocks(cf_soften_mask, dtype=self.dtype) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) + self._set_dask(dx, reset_mask_hardness=False) self._hardmask = False @daskified(_DASKIFIED_VERBOSE) @@ -9144,7 +9118,9 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - d._map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + dx = d.to_dask_array() + dx = dx.map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + d._set_dask(dx, 
reset_mask_hardness=False) return d @@ -12393,15 +12369,6 @@ def sum_of_squares( """ d = _inplace_enabled_define_and_cleanup(self) - # d, _ = _collapse( - # Collapse.sum_of_squares, - # d, - # axis=axes, - # weights=weights, - # keepdims=not squeeze, - # split_every=split_every, - # mtol=mtol, - # ) d.square(inplace=True) d.sum( axes=axes, @@ -12777,7 +12744,7 @@ def var( d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( - partial(Collapse.var, ddof=ddof), + Collapse.var, d, axis=axes, weights=weights, @@ -12967,7 +12934,7 @@ def sqrt(self, dtype=None, inplace=False): return d # ---------------------------------------------------------------- - # Alias + # Aliases # ---------------------------------------------------------------- @property def dtarray(self): @@ -13018,6 +12985,9 @@ def minimum( i=i, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def standard_deviation( self, axes=None, @@ -13039,6 +13009,9 @@ def standard_deviation( i=i, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def variance( self, axes=None, @@ -13099,109 +13072,6 @@ def _size_of_index(index, size=None): return len(index) -def _overlapping_partitions(partitions, indices, axes, master_flip): - """Return the nested list of (modified) partitions which overlap the - given indices to the master array. - - :Parameters: - - partitions : cf.PartitionMatrix - - indices : tuple - - axes : sequence of str - - master_flip : list - - :Returns: - - numpy array - A numpy array of cf.Partition objects. - - **Examples:** - - >>> type(f.Data) - - >>> d._axes - ['dim1', 'dim2', 'dim0'] - >>> axis_to_position = {'dim0': 2, 'dim1': 0, 'dim2' : 1} - >>> indices = (slice(None), slice(5, 1, -2), [1,3,4,8]) - >>> x = _overlapping_partitions(d.partitions, indices, axis_to_position, master_flip) - - """ - - axis_to_position = {} - for i, axis in enumerate(axes): - axis_to_position[axis] = i - - if partitions.size == 1: - partition = partitions.matrix.item() - - # Find out if this partition overlaps the original slice - p_indices, shape = partition.overlaps(indices) - - if p_indices is None: - # This partition is not in the slice out of bounds - raise - # error? - return - - # Still here? Create a new partition - partition = partition.copy() - partition.new_part(p_indices, axis_to_position, master_flip) - partition.shape = shape - - new_partition_matrix = np.empty(partitions.shape, dtype=object) - new_partition_matrix[...] = partition - - return new_partition_matrix - # --- End: if - - # Still here? Then there are 2 or more partitions. - - partitions_list = [] - partitions_list_append = partitions_list.append - - flat_pm_indices = [] - flat_pm_indices_append = flat_pm_indices.append - - partitions_flat = partitions.matrix.flat - - i = partitions_flat.index - - for partition in partitions_flat: - # Find out if this partition overlaps the original slice - p_indices, shape = partition.overlaps(indices) - - if p_indices is None: - # This partition is not in the slice - i = partitions_flat.index - continue - - # Still here? Then this partition overlaps the slice, so - # create a new partition. 
- partition = partition.copy() - partition.new_part(p_indices, axis_to_position, master_flip) - partition.shape = shape - - partitions_list_append(partition) - - flat_pm_indices_append(i) - - i = partitions_flat.index - # --- End: for - - new_shape = [ - len(set(s)) - for s in np.unravel_index(flat_pm_indices, partitions.shape) - ] - - new_partition_matrix = np.empty((len(flat_pm_indices),), dtype=object) - new_partition_matrix[...] = partitions_list - new_partition_matrix.resize(new_shape) - - return new_partition_matrix - - def _broadcast(a, shape): """Broadcast an array to a given shape. @@ -13309,8 +13179,8 @@ def _collapse( *d*. Must have the minimum signature (parameters and default values) ``func(dx, axis=None, keepdims=False, mtol=None, split_every=None)`` (optionally including - ``weights=None`` or ``ddof=None``), where ``dx`` is a - `dask.array.Array` + ``weights=None`` or ``ddof=None``), where ``dx`` is a the + dask array contained in *d*. d: `Data` The data to be collapsed. @@ -13324,32 +13194,29 @@ def _collapse( data. weights: data_like, `dict`, or `None`, optional - Weights associated with values of the data. By default - *weights* is `None`, meaning that all non-missing - elements of the data have a weight of 1 and all - missing elements have a weight of 0. + *weights* is `None`, meaning that all non-missing elements + of the data have a weight of 1 and all missing elements + have a weight of 0. If *weights* is a data_like object then it must be broadcastable to the array. - If *weights* is a dictionary then each key specifies - axes of the data (an `int` or `tuple` of `int`), with - a corresponding value of data_like weights for those - axes. The dimensions of a weights value must - correspond to its key axes in the same order. Not all - of the axes need weights assigned to them. The weights - that will be used will be an outer product of the - dictionary's values. + If *weights* is a dictionary then each key specifies axes + of the data (an `int` or `tuple` of `int`), with a + corresponding value of data_like weights for those + axes. The dimensions of a weights value must correspond to + its key axes in the same order. Not all of the axes need + weights assigned to them. The weights that will be used + will be an outer product of the dictionary's values. However they are specified, the weights are internally - broadcast to the shape of the data, and those weights - that are missing data, or that correspond to the - missing elements of the data, are assigned a weight of - 0. + broadcast to the shape of the data, and those weights that + are missing data, or that correspond to the missing + elements of the data, are assigned a weight of 0. For collapse functions that do not have a ``weights`` - parameter, *weights * must be `None`. + parameter, *weights* must be `None`. keepdims: `bool`, optional By default, the axes which are collapsed are left in the @@ -13370,9 +13237,8 @@ def _collapse( freedom used in the calculation is (N-*ddof*) where N represents the number of non-missing elements. - For collapses that do need degrees of freedom (such as a - mean), *ddof * must be `None` and *func* need not support - a ``ddof`` parameter. + For collapse functions that do not have a ``ddof`` + parameter, *ddof* must be `None`. split_every: `int` or `dict`, optional Determines the depth of the recursive aggregation. See @@ -13408,6 +13274,10 @@ def _collapse( def _parse_weights(d, weights, axis=None): """Parse the weights input to `_collapse`. + .. versionadded:: TODODASK + + .. 
seealso:: `_collapse` + :Parameters: d: `Data` @@ -13424,12 +13294,13 @@ def _parse_weights(d, weights, axis=None): `Data` or `None` * If *weights* is a data_like object then they are returned unchanged as a `Data` object. It is up to the - downstream functions to check the weights - broadcastability. + downstream functions to check if the weights can be + broadcast to the data. * If *weights* is a dictionary then the dictionary values', i.e. the weights components, outer product is - broadcast to the data and returned as a `Data` object. + returned in `Data` object that is broadcastable to the + data. If the dictionary is empty, or none of the axes defined by the keys correspond to collapse axes defined by diff --git a/cf/data/utils.py b/cf/data/utils.py index cda7f77c6e..c1103bc89b 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -571,6 +571,8 @@ def YMDhms(d, attr): raise ValueError(f"Can't get {attr}s from data with {units!r}") d = d._asdatetime() - d._map_blocks(partial(cf_YMDhms, attr=attr), dtype=int) + dx = d.to_dask_array() + dx = dx.map_blocks(partial(cf_YMDhms, attr=attr), dtype=int) + d._set_dask(dx, reset_mask_hardness=False) d.override_units(Units(None), inplace=True) return d diff --git a/cf/functions.py b/cf/functions.py index f8bd234ef1..80feb7e285 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -850,9 +850,9 @@ def _parse(cls, arg): into the `CONSTANTS` dictionary. """ - upper_chunksize = (free_memory_factor() * min_total_memory()) / ( - (_WORKSPACE_FACTOR_1()) + _WORKSPACE_FACTOR_2() - ) + # upper_chunksize = (free_memory_factor() * min_total_memory()) / ( + # (_WORKSPACE_FACTOR_1()) + _WORKSPACE_FACTOR_2() + # ) arg = float(arg) if arg <= 0: diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 745499a7e9..c68bb44cb9 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3297,9 +3297,12 @@ def test_Data_integral(self): e = d.integral(weights=weights) self.assertEqual(e.Units, cf.Units("K")) - e = d.integral(weights=cf.Data(weights, "m")) + e = d.integral(weights=cf.Data(weights, "m"), mtol=0) self.assertEqual(e.Units, cf.Units("K m")) + # Check mtol + self.assertEqual(e.array, np.ma.masked) + def test_Data_max(self): # Masked array a = self.ma @@ -3317,7 +3320,11 @@ def test_Data_max(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.max(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_maximum_absolute_value(self): # Masked array @@ -3336,7 +3343,11 @@ def test_Data_maximum_absolute_value(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.maximum_absolute_value(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_mean(self): # Masked array, non-masked weights @@ -3357,7 +3368,11 @@ def test_Data_mean(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.mean(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_mean_absolute_value(self): # Masked array, non-masked weights @@ -3378,7 +3393,11 @@ def test_Data_mean_absolute_value(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.mean_absolute_value(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + 
self.assertEqual(e.array, np.ma.masked) def test_Data_mid_range(self): # Masked array, non-masked weights @@ -3397,7 +3416,11 @@ def test_Data_mid_range(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.mid_range(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_min(self): # Masked array @@ -3416,7 +3439,11 @@ def test_Data_min(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.min(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_minimum_absolute_value(self): # Masked array @@ -3435,7 +3462,11 @@ def test_Data_minimum_absolute_value(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.minimum_absolute_value(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_range(self): # Masked array @@ -3454,7 +3485,11 @@ def test_Data_range(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.range().Units, cf.Units("K")) + e = d.range(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_root_mean_square(self): # Masked array, non-masked weights @@ -3475,7 +3510,11 @@ def test_Data_root_mean_square(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.root_mean_square().Units, cf.Units("K")) + e = d.root_mean_square(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_sample_size(self): # Masked array @@ -3508,7 +3547,12 @@ def test_Data_sample_size(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.sample_size().Units, cf.Units()) + d = cf.Data(self.ma, "K", chunks=(2, 3, 2, 5)) + e = d.sample_size(mtol=0) + self.assertEqual(e.Units, cf.Units()) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_sd(self): # Masked array, non-masked weights @@ -3519,10 +3563,14 @@ def test_Data_sd(self): sd = d.sd(weights=weights, ddof=1) var = d.var(weights=weights, ddof=1) + self.assertTrue(sd.equals(var.sqrt())) + # Check units self.assertEqual(sd.Units, cf.Units("K")) - self.assertTrue(sd.equals(var.sqrt())) + # Check mtol + sd = d.sd(ddof=0, mtol=0) + self.assertEqual(sd.array, np.ma.masked) def test_Data_sum(self): # Masked array, non-masked weights @@ -3543,7 +3591,11 @@ def test_Data_sum(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.sum().Units, cf.Units("K")) + e = d.sum(mtol=0) + self.assertEqual(e.Units, cf.Units("K")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_sum_of_squares(self): # Masked array, non-masked weights @@ -3564,7 +3616,11 @@ def test_Data_sum_of_squares(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.sum_of_squares().Units, cf.Units("K2")) + e = d.sum_of_squares(mtol=0) + self.assertEqual(e.Units, cf.Units("K2")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_sum_of_weights(self): # Masked array, non-masked weights @@ -3598,9 +3654,13 @@ def test_Data_sum_of_weights(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.sum_of_weights().Units, cf.Units()) - w = cf.Data(weights, "m") - 
self.assertEqual(d.sum_of_weights(weights=w).Units, cf.Units("m")) + e = d.sum_of_weights() + self.assertEqual(e.Units, cf.Units()) + e = d.sum_of_weights(weights=cf.Data(weights, "m"), mtol=0) + self.assertEqual(e.Units, cf.Units("m")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) def test_Data_sum_of_weights2(self): # Masked array, non-masked weights @@ -3631,9 +3691,12 @@ def test_Data_sum_of_weights2(self): e = d.sum_of_weights2(weights=weights) self.assertEqual(e.Units, cf.Units()) - e = d.sum_of_weights2(weights=cf.Data(weights, "m")) + e = d.sum_of_weights2(weights=cf.Data(weights, "m"), mtol=0) self.assertEqual(e.Units, cf.Units("m2")) + # Check mtol + self.assertEqual(e.array, np.ma.masked) + def test_Data_var(self): # Masked array, non-masked weights a = self.ma @@ -3694,7 +3757,11 @@ def test_Data_var(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.var(ddof=0).Units, cf.Units("K2")) + e = d.var(ddof=0, mtol=0) + self.assertEqual(e.Units, cf.Units("K2")) + + # Check mtol + self.assertEqual(e.array, np.ma.masked) @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __lt__ and __le__") def test_Data_mean_of_upper_decile(self): @@ -3735,7 +3802,11 @@ def test_Data_mean_of_upper_decile(self): self.assertTrue(np.allclose(e, b)) # Check units - self.assertEqual(d.mean_of_upper_decile().Units, cf.Units("K2")) + e = d.mean_of_upper_decile(mtol=0) + self.assertEqual(e.Units, cf.Units("K2")) + + # Check mtol + self.assertEqual(e.array.item(), np.ma.masked) if __name__ == "__main__": From ade6277ea3b3a9deb93a67cc81115b4a9e697712 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 17 Mar 2022 10:31:38 +0000 Subject: [PATCH 11/37] docs tidy --- cf/formula_terms.py | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/cf/formula_terms.py b/cf/formula_terms.py index 8ecb993ecf..17421b94ee 100644 --- a/cf/formula_terms.py +++ b/cf/formula_terms.py @@ -1,6 +1,6 @@ import logging -import cfdm +from cfdm.core import DocstringRewriteMeta from .constants import ( formula_terms_computed_standard_names, @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -class FormulaTerms(metaclass=cfdm.core.DocstringRewriteMeta): +class FormulaTerms(metaclass=DocstringRewriteMeta): """Functions for computing non-parametric vertical coordinates from the formula defined by a coordinate reference construct. @@ -225,8 +225,6 @@ def _computed_standard_name(f, standard_name, coordinate_reference): """Find the standard name of the computed non-parametric vertical coordinates. - {{formula terms links}} - .. versionadded:: 3.8.0 :Parameters: @@ -558,8 +556,6 @@ def _check_standard_name_consistency( s-coordinate, the ocean_sigma over z coordinate, and the ocean double sigma coordinate. - {{formula terms links}} - .. versionadded:: 3.8.0 :Parameters: @@ -680,8 +676,6 @@ def atmosphere_ln_pressure_coordinate( """Compute non-parametric vertical coordinates from atmosphere_ln_pressure_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -761,8 +755,6 @@ def atmosphere_sigma_coordinate( """Compute non-parametric vertical coordinates from atmosphere_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. 
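As a minimal numpy-only sketch of the kind of computation these
formula-terms functions perform (illustrative only, not the cf-python
API; the array names here are assumptions), the
atmosphere_sigma_coordinate formula of CF Appendix D, with the
vertical axis placed last as noted in the docstrings above, is simply:

    import numpy as np

    # Illustrative inputs: sigma levels and a small surface pressure field
    sigma = np.array([0.2, 0.5, 0.8])            # vertical coordinate, (k,)
    ps = np.array([[1000.0, 990.0],
                   [1013.0, 1005.0]])            # surface pressure (hPa), (j, i)
    ptop = 10.0                                  # pressure at model top (hPa)

    # CF Appendix D: p(k, j, i) = ptop + sigma(k) * (ps(j, i) - ptop),
    # computed with the vertical axis as the last (rightmost) dimension
    p = ptop + (ps[..., np.newaxis] - ptop) * sigma
    print(p.shape)  # (2, 2, 3)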
@@ -858,8 +850,6 @@ def atmosphere_hybrid_sigma_pressure_coordinate( atmosphere_hybrid_sigma_pressure_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -989,8 +979,6 @@ def atmosphere_hybrid_height_coordinate( """Compute non-parametric vertical coordinates from atmosphere_hybrid_height_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1087,8 +1075,6 @@ def atmosphere_sleve_coordinate( """Compute non-parametric vertical coordinates from atmosphere_sleve_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1242,8 +1228,6 @@ def ocean_sigma_coordinate( """Compute non-parametric vertical coordinates from ocean_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1347,8 +1331,6 @@ def ocean_s_coordinate( """Compute non-parametric vertical coordinates from ocean_s_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1489,8 +1471,6 @@ def ocean_s_coordinate_g1( """Compute non-parametric vertical coordinates from ocean_s_coordinate_g1 parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1618,8 +1598,6 @@ def ocean_s_coordinate_g2( """Compute non-parametric vertical coordinates from ocean_s_coordinate_g2 parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1747,8 +1725,6 @@ def ocean_sigma_z_coordinate( """Compute non-parametric vertical coordinates from ocean_sigma_z_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1897,8 +1873,6 @@ def ocean_double_sigma_coordinate( """Compute non-parametric vertical coordinates from ocean_double_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -2051,8 +2025,6 @@ def formula( ): """Compute non-parametric vertical coordinates. 
- {{formula terms links}}
-
 Dimensional vertical auxiliary coordinate values are computed from
 parametric vertical coordinate values (usually dimensionless) and
 associated domain ancillary constructs, as defined by the formula
From d5f884dff8f34bafef9bd5facb1d8411631a9c82 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Thu, 17 Mar 2022 10:51:04 +0000
Subject: [PATCH 12/37] docs tidy

---
 cf/formula_terms.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cf/formula_terms.py b/cf/formula_terms.py
index 17421b94ee..bf1bdd1214 100644
--- a/cf/formula_terms.py
+++ b/cf/formula_terms.py
@@ -64,13 +64,13 @@ def __docstring_substitutions__(self):
 return _docstring_substitution_definitions

 def __docstring_package_depth__(self):
- """Return the package depth for {{package}} docstring
+ """Return the package depth for "package" docstring
 substitutions.

 See `_docstring_package_depth` for details.

 """
- return 1
+ return 0

 # ----------------------------------------------------------------
 # Private methods
From b132139dd9350c700a1e7df91a5f5dd3f32fdd36 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Thu, 17 Mar 2022 11:13:02 +0000
Subject: [PATCH 13/37] getting there

---
 .../{collapse_functions.py => collapse.py}    | 210 ++++++++++++++----
 cf/data/data.py                               | 174 +++++++++------
 cf/docstring/docstring.py                     |  37 ++-
 cf/test/test_Data.py                          |   6 +
 4 files changed, 295 insertions(+), 132 deletions(-)
 rename cf/data/{collapse_functions.py => collapse.py} (89%)

diff --git a/cf/data/collapse_functions.py b/cf/data/collapse.py
similarity index 89%
rename from cf/data/collapse_functions.py
rename to cf/data/collapse.py
index a9a9af6b7c..52d34e95d1 100644
--- a/cf/data/collapse_functions.py
+++ b/cf/data/collapse.py
@@ -1,3 +1,4 @@
+import inspect
 from functools import partial, reduce
 from operator import mul

@@ -7,16 +8,50 @@
 from dask.array.core import _concatenate2
 from dask.array.reductions import divide, numel, reduction
 from dask.core import flatten
-from dask.utils import deepmap  # Apply function inside nested lists
+from dask.utils import deepmap
+
+from ..docstring import _docstring_substitution_definitions


 class Collapse(metaclass=DocstringRewriteMeta):
- """Container for functions that collapse `dask` arrays.
+ """Container for functions that collapse dask arrays.

 .. versionadded:: TODODASK

 """

+ def __docstring_substitutions__(self):
+ """Define docstring substitutions that apply to this class and
+ all of its subclasses.
+
+ These are in addition to, and take precedence over, docstring
+ substitutions defined by the base classes of this class.
+
+ See `_docstring_substitutions` for details.
+
+ .. versionadded:: TODODASK
+
+ .. seealso:: `_docstring_substitutions`
+
+ :Returns:
+
+ `dict`
+ The docstring substitutions that have been applied.
+
+ """
+ return _docstring_substitution_definitions
+
+ def __docstring_package_depth__(self):
+ """Return the package depth for "package" docstring
+ substitutions.
+
+ See `_docstring_package_depth` for details.
+
+ .. versionadded:: TODODASK
+
+ """
+ return 0
+
 @staticmethod
 def max(a, axis=None, keepdims=False, mtol=None, split_every=None):
 """Return maximum values of an array.

 Calculates the maximum value of an array or the maximum values
 along axes.

 .. versionadded:: TODODASK

 :Parameters:

 a: `dask.array.Array`
 The array to be collapsed.

 {{collapse axes: (sequence of) `int`, optional}}

 {{collapse keepdims: `bool`, optional}}

- {{mtol: number, optional}
+ {{mtol: number, optional}}

 {{split_every: `int` or `dict`, optional}}

 :Returns:

 `dask.array.Array`
 The collapsed array.
""" + check_input_dtype(a) dtype = a.dtype return reduction( a, @@ -85,7 +121,7 @@ def max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -95,6 +131,7 @@ def max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. """ + check_input_dtype(a) dtype = a.dtype return reduction( a, @@ -135,7 +172,7 @@ def mean( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -145,6 +182,7 @@ def mean( The collapsed array. """ + check_input_dtype(a) dtype = "f8" return reduction( a, @@ -186,7 +224,7 @@ def mean_abs( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -196,6 +234,7 @@ def mean_abs( The collapsed array. """ + check_input_dtype(a) dtype = "f8" return reduction( a, @@ -235,7 +274,7 @@ def mid_range( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -245,6 +284,7 @@ def mid_range( The collapsed array. """ + check_input_dtype(a, allow="fi") dtype = "f8" return reduction( a, @@ -281,7 +321,7 @@ def min(a, axis=None, keepdims=False, mtol=None, split_every=None): {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -291,6 +331,7 @@ def min(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. """ + check_input_dtype(a) dtype = a.dtype return reduction( a, @@ -327,7 +368,7 @@ def min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -337,6 +378,7 @@ def min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. """ + check_input_dtype(a) dtype = a.dtype return reduction( a, @@ -373,7 +415,7 @@ def range(a, axis=None, keepdims=False, mtol=None, split_every=None): {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -383,6 +425,7 @@ def range(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. """ + check_input_dtype(a, allow="fi") dtype = a.dtype return reduction( a, @@ -423,7 +466,7 @@ def rms( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -433,6 +476,7 @@ def rms( The collapsed array. """ + check_input_dtype(a) dtype = "f8" return reduction( a, @@ -470,7 +514,7 @@ def sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -480,6 +524,7 @@ def sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. """ + check_input_dtype(a) dtype = "i8" return reduction( a, @@ -520,7 +565,7 @@ def sum( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -530,6 +575,7 @@ def sum( The collapsed array. 
""" + check_input_dtype(a) if weights is None: dtype = double_precision_dtype(a) else: @@ -575,7 +621,7 @@ def sum_of_weights( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -585,6 +631,7 @@ def sum_of_weights( The collapsed array. """ + check_input_dtype(a) dtype = "f8" return reduction( a, @@ -626,7 +673,7 @@ def sum_of_weights2( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -636,6 +683,7 @@ def sum_of_weights2( The collapsed array. """ + check_input_dtype(a) dtype = "f8" return reduction( a, @@ -683,7 +731,7 @@ def var( {{collapse keepdims: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{ddof: number}} @@ -695,6 +743,7 @@ def var( The collapsed array. """ + check_input_dtype(a) dtype = "f8" return reduction( a, @@ -711,12 +760,44 @@ def var( ) -def double_precision_dtype(a): - """Return the double precision data type of an aray. +def check_input_dtype(a, allow="fib"): + """Check that data has a data type allowed by a collapse method. + + The collapse method is assumed to be defined by the calling + function. :Parameters: - a: array_like + a: `dask.array.Array` + The data. + + allow: `str`, optional + The data type kinds allowed by the collapse + method. Defaults to ``'fib'``, meaning that only float, + integer and Boolean data types are allowed. + + :Returns: + + `None` + + """ + if a.dtype.kind not in allow: + method = inspect.currentframe().f_back.f_code.co_name + raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") + + +def double_precision_dtype(a, bool_type="i"): + """Returns the corresponding double precision data type of an array. + + :Parameters: + + a: `dask.array.Array` + The data. + + bool_type: `str`, optional + The corresponding double data type kind for Boolean + data. Defaults to ``'i'``, meaning ``'i8'`` is + returned. Set to ``'f'` to return ``'f8'`` instead. :Returns: @@ -725,21 +806,30 @@ def double_precision_dtype(a): **Examples** - >>> for d in (float, 'float32', int, 'int32'): + >>> for d in (int, 'int32', float, 'float32', bool): ... print(double_precision_dtype(np.array(1, dtype=d))) ... + i8 + i8 f8 f8 i8 - i8 + >>> double_precision_dtype(np.array(1, dtype=bool), bool_type='f') + 'f8' """ - return a.dtype.kind + "8" + kind = a.dtype.kind + if kind == "b": + return bool_type + "8" + + if kind in "fi": + return kind + "8" + + raise TypeError("Can't collapse data with {a.dtype!r}") def mask_small_sample_size(x, N, axis, mtol, original_shape): - """Mask elements where the sample size of the collapsed data is - below a threshold. + """Mask elements where the sample size is below a threshold. .. versionadded:: TODODASK @@ -758,18 +848,19 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between 0 and 1 inclusive) of the contributing input data values. - A missing datum in the output array occurs whenever at - least ``100*mtol%`` of its contributing input array - elements are non-missing data. The default of *mtol* is 1, - meaning that a missing datum in the output array only - occurs when all of its contributing input array elements - are missing data. A value of 0 means that a missing datum - in the output array occurs whenever any of its - contributing input array elements are missing. 
def mask_small_sample_size(x, N, axis, mtol, original_shape):
 """Mask elements where the sample size is below a threshold.

 .. versionadded:: TODODASK

 :Parameters:

 x: `numpy.ndarray`
 The collapsed data.

 N: `numpy.ndarray`
 The sample size.

 axis: sequence of `int`
 The axes being collapsed.

 mtol: number
 The sample size threshold below which collapsed values are
 set to missing data. It is defined as a fraction (between
 0 and 1 inclusive) of the contributing input data values.
- A missing datum in the output array occurs whenever at
- least ``100*mtol%`` of its contributing input array
- elements are non-missing data. The default of *mtol* is 1,
- meaning that a missing datum in the output array only
- occurs when all of its contributing input array elements
- are missing data. A value of 0 means that a missing datum
- in the output array occurs whenever any of its
- contributing input array elements are missing. Any
- intermediate value is allowed. Note that for non-zero
- values of *mtol*, different collapsed elements may have
- different sample sizes, depending on the distribution of
- missing data in the input data.
+
+ The default of *mtol* is 1, meaning that a missing datum
+ in the output array occurs whenever all of its
+ contributing input array elements are missing data.
+
+ For other values, a missing datum in the output array
+ occurs whenever more than ``100*mtol%`` of its
+ contributing input array elements are missing data.
+
+ Note that for non-zero values of *mtol*, different
+ collapsed elements may have different sample sizes,
+ depending on the distribution of missing data in the input
+ data.

 original_shape: `tuple`
 The shape of the original, uncollapsed data.

 :Returns:

 `numpy.ndarray`
 Array *x* masked where *N* is sufficiently small. Note
 that the input *x* might be modified in-place with the
 contents of the output.

 """
 if mtol < 1:
 Nmax = reduce(mul, [original_shape[i] for i in axis], 1)
 x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False)

 return x


def sum_weights_chunk(
 x, weights=None, squared=False, N=None, dtype="f8", **kwargs
):
 """Sum the weights.

 .. versionadded:: TODODASK

 :Parameters:

 x: `numpy.ndarray`
 The collapsed data.

 weights: `numpy.ndarray`, optional
 The weights associated with values of the data. Must have
 the same shape as *x*. By default *weights* is `None`,
 meaning that all non-missing elements of the data have a
 weight of 1 and all missing elements have a weight of
 0. If given as an array then those weights that are
 missing data, or that correspond to the missing elements
 of the data, are assigned a weight of 0.

 squared: `bool`, optional
 If True calculate the sum of the squares of the weights.

 N: `numpy.ndarray`, optional
 The sample size. If provided as an array and there are no
 weights, then *N* is returned instead of calculating the
 sum (of the squares) of weights. Ignored if *weights* is
 not `None`.

 :Returns:

 `numpy.ndarray`
 The sum of the weights.

 """
 if weights is None:
 # All weights are 1, so the sum of the weights and the sum of
 # the squares of the weights are both equal to the sample
 # size.
 if N is None:
 N = cf_sample_size_chunk(x, **kwargs)["N"]

 return N

 if squared:
 weights = np.multiply(weights, weights, dtype=dtype)

 if np.ma.is_masked(x):
 weights = np.ma.masked_where(x.mask, weights)

 return chunk.sum(weights, dtype=dtype, **kwargs)


def combine_arrays(
 pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs
):
 """Worker function for Combine callables.

 Select arrays by dictionary key from a nested list of
 dictionaries, concatenate the resulting nested list of arrays
 along the specified axes, and then apply a function to the result
 along those same axes.

 See `dask.array.reductions.mean_combine` for an example.

 .. 
versionadded:: TODODASK @@ -926,7 +1044,7 @@ def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): # N, sum d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) - d["V1"] = sum_weights(x, weights, N=d["N"], **kwargs) + d["V1"] = sum_weights_chunk(x, weights, N=d["N"], **kwargs) d["weighted"] = weights is not None @@ -1813,7 +1931,7 @@ def cf_sum_of_weights_chunk( # N d = cf_sample_size_chunk(x, **kwargs) - d["sum"] = sum_weights( + d["sum"] = sum_weights_chunk( x, weights=weights, squared=squared, N=d["N"], **kwargs ) @@ -1885,7 +2003,9 @@ def cf_var_chunk( d["part"] = part if ddof == 1: - d["V2"] = sum_weights(x, weights, squared=True, N=d["N"], **kwargs) + d["V2"] = sum_weights_chunk( + x, weights, squared=True, N=d["N"], **kwargs + ) else: d["V2"] = d["N"] diff --git a/cf/data/data.py b/cf/data/data.py index 151333f1fc..ace84ceedd 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -47,7 +47,7 @@ NetCDFArray, UMArray, ) -from .collapse_functions import Collapse +from .collapse import Collapse from .creation import ( compressed_to_dask, convert_to_builtin_type, @@ -6976,7 +6976,7 @@ def max( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `maximum_absolute_value`, `min` + ..seealso:: `sample_size`, `maximum_absolute_value`, `min` :Parameters: @@ -7045,7 +7045,7 @@ def maximum_absolute_value( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `max`, `minimum_absolute_value` + ..seealso:: `sample_size`, `max`, `minimum_absolute_value` :Parameters: @@ -7115,7 +7115,7 @@ def min( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `max`, `minimum_absolute_value` + ..seealso:: `sample_size`, `max`, `minimum_absolute_value` :Parameters: @@ -7183,7 +7183,7 @@ def minimum_absolute_value( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `maximum_absolute_value`, `min` + ..seealso:: `sample_size`, `maximum_absolute_value`, `min` :Parameters: @@ -7254,7 +7254,7 @@ def mean( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `mean_abslute_value`, `sd`, `sum` + ..seealso:: `sample_size`, `mean_abslute_value`, `sd`, `sum` :Parameters: @@ -7332,7 +7332,7 @@ def mean_absolute_value( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `mean`, `sd`, `sum` + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: @@ -7409,7 +7409,7 @@ def integral( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `mean`, `sd`, `sum` + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: @@ -7503,6 +7503,8 @@ def sample_size( Calculates the sample size value or the sample size values along axes. + .. seealso:: `sum_of_weights` + :Parameters: {{collapse axes: (sequence of) `int`, optional}} @@ -10157,17 +10159,25 @@ def mid_range( inplace=False, i=False, ): - """Collapse axes with the absolute difference between their - maximum and minimum values. + """Calculate mid-range values. + + The mid-range is half of the maximum plus the minimum. - Missing data array elements are omitted from the calculation. + Calculates the mid-range value or the mid-range values along + axes. - .. 
seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sample_size`, - `sd`, `sum`, `sum_of_weights`, `sum_of_weights2`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `max`, `min`, `range` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + {{mtol: number, optional} {{split_every: `int` or `dict`, optional}} @@ -10183,7 +10193,18 @@ def mid_range( `Data` or `None` The collapsed array. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.mid_range() + """ d = _inplace_enabled_define_and_cleanup(self) @@ -10490,10 +10511,15 @@ def root_mean_square( split_every=None, inplace=False, ): - """Collapse axes with their root mean square. + """Calculate root mean square (RMS) values. + + Calculates the RMS value or the RMS values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - Missing data array elements and their corresponding weights are - omitted from the calculation. + ..seealso:: `sample_size`, `mean`, `sum`, :Parameters: @@ -10516,10 +10542,24 @@ def root_mean_square( `Data` or `None` The collapsed array. - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.root_mean_square() + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.root_mean_square(weights=w) + """ d = _inplace_enabled_define_and_cleanup(self) @@ -12107,41 +12147,23 @@ def range( inplace=False, i=False, ): - """Collapse axes with the absolute difference between their - maximum and minimum values. + """Calculate range values. - Missing data array elements are omitted from the calculation. + The range is the maximum minus the minimum. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sample_size`, - `sd`, `sum`, `sum_of_weights`, `sum_of_weights2`, - `var` - - :Parameters: + Calculates the range value or the range values along axes. - split_every: `int` or `dict`, optional - Determines the depth of the recursive aggregation. See - `dask.array.reduction` for details. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + ..seealso:: `sample_size`, `max`, `min`, `mid_range` - set to a number greater than to equal to the number of - input chunks, the aggregation will be performed in two - steps, one ``chunk`` function per input chunk and a - single ``aggregate`` function at the end. If set to - less than that (and greater than 1), an intermediate - ``combine`` function will be used, so that any one - ``combine`` or ``aggregate`` function has no more than - ``split_every`` inputs. The depth of the aggregation - graph will be :math:`log_{split_every}(input chunks - along reduced axes)`. Setting to a low value can - reduce cache size and network transfers, at the cost - of more CPU and a larger dask graph. + :Parameters: - Different values can be assigned to different axes in - a dictionary. + {{collapse axes: (sequence of) `int`, optional}} - Omit to let dask heuristically decide a good - default. 
A default can also be set globally with the - ``split_every`` key in :mod:`dask.config`. + {{collapse squeeze: `bool`, optional}} {{mtol: number, optional} @@ -12158,7 +12180,18 @@ def range( `Data` or `None` The collapsed array. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.range() + """ d = _inplace_enabled_define_and_cleanup(self) @@ -12244,8 +12277,8 @@ def sum( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `integral`, `mean`, `sd`, `sum_of_squares`, - `sum_of_weights` + ..seealso:: `sample_size`, `integral`, `mean`, `sd`, + `sum_of_squares`, `sum_of_weights` :Parameters: @@ -12323,7 +12356,7 @@ def sum_of_squares( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `sum`, `sum_of_squares`, + ..seealso:: `sample_size`, `sum`, `sum_of_squares`, `sum_of_weights2` :Parameters: @@ -12408,7 +12441,7 @@ def sum_of_weights( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `sum`, `sum_of_squares`, + ..seealso:: `sample_size`, `sum`, `sum_of_squares`, `sum_of_weights2` :Parameters: @@ -12507,8 +12540,8 @@ def sum_of_weights2( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `sum`, `sum_of_squares`, - `sum_of_weights2` + ..seealso:: `sample_size`, `sum`, `sum_of_squares`, + `sum_of_weights` :Parameters: @@ -12603,7 +12636,7 @@ def sd( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `mean`, `sum`, `var` + ..seealso:: `sample_size`, `mean`, `sum`, `var` :Parameters: @@ -12689,7 +12722,7 @@ def var( https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods for mathematical definitions. - ..seealso:: `mean`, `sd`, `sum` + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: @@ -12846,6 +12879,8 @@ def square(self, dtype=None, inplace=False): **Examples** >>> d = cf.Data([[0, 1, 2.5, 3, 4]], 'K', mask=[[0, 0, 0, 1, 0]]) + >>> print(d.array) + [[0.0 1.0 2.5 -- 4.0]] >>> e = d.square() >>> e @@ -12893,14 +12928,16 @@ def sqrt(self, dtype=None, inplace=False): **Examples** >>> d = cf.Data([[0, 1, 2, 3, 4]], 'K2', mask=[[0, 0, 0, 1, 0]]) + >>>print(d.array) + [[0 1 2 -- 4]] >>> e = d.sqrt() >>> e >>> print(e.array) [[0.0 1.0 1.4142135623730951 -- 2.0]] - Negative values raise a warning but result in either NaN or, - if the there are already missing values, missing data: + Negative values raise a warning but nonetheless result in NaN + or, if the there are already missing values, missing data: >>> import warnings >>> d = cf.Data([0, 1, -4]) @@ -13225,12 +13262,17 @@ def _collapse( to False then collapsed axes are removed from the data. mtol: number, optional - Set the sampe size threshold below which collapsed values - are set to missing data. It is defined as a fraction - (between 0 and 1 inclusive) of the contributing input data - values. A missing datum in the output array occurs - whenever at least ``100*mtol%`` of its contributing input - array elements are missing data. + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. 
+ + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. ddof: number, optional The delta degrees of freedom. The number of degrees of diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index e6a8f44902..7661b8078f 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -28,14 +28,8 @@ # ---------------------------------------------------------------- # Class description susbstitutions (1 level of indentation) # ---------------------------------------------------------------- - "{{formula terms links}}": """See the parametric vertical coordinate sections of the CF - conventions for more details: - - `4.3.3. Parametric Vertical Coordinate - `_ - - `Appendix D: Parametric Vertical Coordinates - `_""", + "{{formula terms links}}": """See CF section 4.3.3 "Parametric Vertical Coordinate" and CF + Appendix D "Parametric Vertical Coordinates" for details.""", # ---------------------------------------------------------------- # Class description susbstitutions (1 level of indentation) # ---------------------------------------------------------------- @@ -297,19 +291,20 @@ The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between 0 and 1 inclusive) of the contributing input - data values. A missing datum in the output array - occurs whenever at least ``100*mtol%`` of its - contributing input array elements are missing - data. The default of *mtol* is 1, meaning that a - missing datum in the output array only occurs when all - of its contributing input array elements are missing - data. A value of 0 means that a missing datum in the - output array occurs whenever any of its contributing - input array elements are missing. Any intermediate - value is allowed. Note that for non-zero values of - *mtol*, different collapsed elements may have - different sample sizes, depending on the distribution - of missing data in the input data.""", + data values. + + The default of *mtol* is 1, meaning that a missing + datum in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. 
+
+                Note that for non-zero values of *mtol*, different
+                collapsed elements may have different sample sizes,
+                depending on the distribution of missing data in the
+                input data.""",
        # ddof
        "{{ddof: number}}": """ddof: number
                The delta degrees of freedom, a non-negative
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index c68bb44cb9..74c35cad98 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -3422,6 +3422,9 @@ def test_Data_mid_range(self):
         # Check mtol
         self.assertEqual(e.array, np.ma.masked)
 
+        with self.assertRaises(TypeError):
+            cf.Data([0, 1], dtype=bool).mid_range()
+
     def test_Data_min(self):
         # Masked array
         a = self.ma
@@ -3491,6 +3494,9 @@ def test_Data_range(self):
         # Check mtol
         self.assertEqual(e.array, np.ma.masked)
 
+        with self.assertRaises(TypeError):
+            cf.Data([0, 1], dtype=bool).range()
+
     def test_Data_root_mean_square(self):
         # Masked array, non-masked weights
From 41306d0064516e0a0459d0552a56ce9ce4662763 Mon Sep 17 00:00:00 2001
From: David Hassell
Date: Thu, 17 Mar 2022 19:11:30 +0000
Subject: [PATCH 14/37] collapse docs

---
 docs/source/field_analysis.rst | 156 ++++++++++++++++++++------------
 1 file changed, 97 insertions(+), 59 deletions(-)

diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst
index 3ea43360f5..87c11d5850 100644
--- a/docs/source/field_analysis.rst
+++ b/docs/source/field_analysis.rst
@@ -131,84 +131,116 @@ the new cell method construct (if one is created).
 ============================ ======================================== ==========================
 Method                       Description                              Cell method
 ============================ ======================================== ==========================
-``'maximum'``                The maximum of the values.               ``maximum``
+``'sample_size'``            The sample size, :math:`N`, equal to the ``point``
+                             number of non-missing values.
+
+``'maximum'``                The maximum of the non-missing values.   ``maximum``
+
+                             .. math:: m_x=\max\{x_1, \ldots, x_N\}
 
-``'minimum'``                The minimum of the values.               ``minimum``
+``'minimum'``                The minimum of the non-missing values.   ``minimum``
+
+                             .. math:: m_n=\min\{x_1, \ldots, x_N\}
 
-``'maximum_absolute_value'`` The maximum of the absolute values.      ``maximum_absolute_value``
+``'maximum_absolute_value'`` The maximum of the non-missing absolute  ``maximum_absolute_value``
+                             values.
+
+                             .. math:: \max\{|x_1|, \ldots, |x_N|\}
 
-``'minimum_absolute_value'`` The minimum of the absolute values.      ``minimum_absolute_value``
+``'minimum_absolute_value'`` The minimum of the non-missing absolute  ``minimum_absolute_value``
+                             values.
+
+                             .. math:: \min\{|x_1|, \ldots, |x_N|\}
 
 ``'mid_range'``              The average of the maximum and the       ``mid_range``
-                             minimum of the values.
+                             minimum of the non-missing values.
+
+                             .. math:: \frac{m_x + m_n}{2}
 
 ``'range'``                  The absolute difference between the      ``range``
-                             maximum and the minimum of the values.
+                             maximum and the minimum of the
+                             non-missing values.
+
+                             .. math:: m_x - m_n
 
-``'median'``                 The median of the values.                ``median``
+``'median'``                 The median of the :math:`N` non-missing  ``median``
+                             values.
 
-``'sample_size'``            The sample size, :math:`N`, as would be  ``point``
-                             used for other calculations, i.e. the
-                             number of non-missing values.
-
-``'sum_of_weights'``         The sum of :math:`N` weights             ``sum``
-                             :math:`w_i`, as would be used for other
-                             calculations, is
+``'sum_of_weights'``         The sum of the :math:`N` weights         ``sum``
+                             :math:`w_i` that correspond to
+                             non-missing values.
 
                              .. 
math:: V_{1}=\sum_{i=1}^{N} w_i
 
-``'sum_of_weights2'``        The sum of the squares of :math:`N`      ``sum``
-                             weights :math:`w_i`, as would be used
-                             for other calculations, is
+``'sum_of_weights2'``        The sum of the squares of the :math:`N`  ``sum``
+                             weights :math:`w_i` that correspond to
+                             non-missing values.
 
-                              .. math:: V_{2}=\sum_{i=1}^{N} w_i^{2}
+                             .. math:: V_{2}=\sum_{i=1}^{N} w_i^{2}
 
-``'sum'``                    The unweighted sum of :math:`N` values   ``sum``
-                             :math:`x_i` is
+``'sum'``                    The unweighted sum of the :math:`N`      ``sum``
+                             non-missing values :math:`x_i` is
 
                              .. math:: t=\sum_{i=1}^{N} x_i
 
+                             The weighted sum of the :math:`N`
+                             non-missing values :math:`x_i` with
+                             corresponding weights :math:`w_i` is
+
+                             .. math:: \hat{t}=\sum_{i=1}^{N} w_i x_i
+
-``'sum_of_squares'``         The unweighted sum of the squares of     ``sum_of_squares``
-                             :math:`N` values :math:`x_i` is
+``'sum_of_squares'``         The unweighted sum of the squares of the ``sum_of_squares``
+                             :math:`N` non-missing values :math:`x_i`
+                             is
 
                              .. math:: t_2=\sum_{i=1}^{N} x_{i}^{2}
 
+                             The weighted sum of the squares of the
+                             :math:`N` non-missing values :math:`x_i`
+                             with corresponding weights :math:`w_i`
+                             is
+
+                             .. math:: \hat{t}_2=\sum_{i=1}^{N}
+                                       w_i x_{i}^{2}
+
-``'integral'``               The integral of :math:`N` values         ``sum``
-                             :math:`x_i` with corresponding cell
-                             measures :math:`m_i` is
+``'integral'``               The :ref:`weighted <Weights>`            ``sum``
+                             sum of the :math:`N` non-missing values
+                             :math:`x_i` with corresponding cell
+                             measures :math:`m_i`.
 
                              .. math:: i=\sum_{i=1}^{N} m_i x_i
 
-                             Note that the integral differs from a
-                             weighted sum in that the units of the
-                             cell measures are incorporated into the
-                             result.
+                             .. note:: The integral differs from a
+                                       weighted sum in that the units
+                                       of the measures are
+                                       incorporated into the result.
 
-``'mean'``                   The unweighted mean of :math:`N` values  ``mean``
-                             :math:`x_i` is
+``'mean'``                   The unweighted mean of the :math:`N`     ``mean``
+                             non-missing values :math:`x_i` is
 
                              .. math:: \mu=\frac{1}{N}\sum_{i=1}^{N} x_i
 
                              The :ref:`weighted <Weights>`
-                             mean of :math:`N` values :math:`x_i`
-                             with corresponding weights :math:`w_i`
-                             is
+                             mean of the :math:`N` non-missing values
+                             :math:`x_i` with corresponding weights
+                             :math:`w_i` is
 
                              .. math:: \hat{\mu}=\frac{1}{V_{1}}
                                        \sum_{i=1}^{N}
                                        w_i x_i
 
-``'mean_absolute_value'``    The unweighted mean of :math:`N`         ``mean_absolute_value``
-                             values :math:`x_i` absoluted is
+``'mean_absolute_value'``    The unweighted mean of the :math:`N`     ``mean_absolute_value``
+                             non-missing absolute values :math:`x_i`
+                             is
 
                              .. math:: \mu_{abs}=\frac{1}{N}
                                        \sum_{i=1}^{N}|x_i|
 
                              The :ref:`weighted <Weights>`
-                             mean of :math:`N` values :math:`x_i`
-                             absoluted with corresponding weights
-                             :math:`w_i` is
+                             mean of the :math:`N` non-missing
+                             absolute values :math:`x_i` with
+                             corresponding weights :math:`w_i` is
 
                              .. math:: \hat{\mu}_{abs}=
                                        \frac{1}{V_{1}}
@@ -218,8 +250,8 @@ Method                       Description                              Cell method
                              upper group of data values defined by
                              the upper tenth of their distribution
 
-``'variance'``               The unweighted variance of :math:`N`     ``variance``
-                             values :math:`x_i` and with
+``'variance'``               The unweighted variance of the :math:`N` ``variance``
+                             non-missing values :math:`x_i` and with
                              :math:`N-ddof` degrees of freedom
                              (:math:`ddof\ge0`) is
 
                              .. math:: s_{N-ddof}^{2}=
                                        \frac{1}{N-ddof}
                                        \sum_{i=1}^{N}
                                        (x_i - \mu)^2
 
                              The unweighted biased estimate of the
                              variance is obtained with
                              :math:`ddof=0`, and the unweighted
                              unbiased estimate of the variance is
                              obtained with :math:`ddof=1`. 
The :ref:`weighted ` - biased estimate of the variance of - :math:`N` values :math:`x_i` with - corresponding weights :math:`w_i` is + biased estimate of the variance of the + :math:`N` non-missing values :math:`x_i` + with corresponding weights :math:`w_i` + is .. math:: \hat{s}_{N}^{2}= \frac{1}{V_{1}} @@ -246,9 +279,11 @@ Method Description Cell met w_i(x_i - \hat{\mu})^{2} - The corresponding :ref:`weighted - ` unbiased estimate of - the variance is + The :ref:`weighted ` + unbiased estimate of the variance of the + :math:`N` non-missing values :math:`x_i` + with corresponding weights :math:`w_i` + is .. math:: \hat{s}^{2}=\frac{1}{V_{1} - (V_{1}/V_{2})} @@ -256,28 +291,31 @@ Method Description Cell met w_i(x_i - \hat{\mu})^{2} - In both cases, the weights are assumed - to be non-random reliability weights, as - opposed to frequency weights. + .. note:: The weights used in the + variance calculations are + assumed to be non-random + reliability weights, as + opposed to frequency weights. ``'standard_deviation'`` The standard deviation is the square ``standard_deviation`` root of the unweighted or :ref:`weighted ` - variance, as defined in this table. + variance. -``'root_mean_square'`` The unweighted root mean square of ``root_mean_square`` - :math:`N` values :math:`x_i` is +``'root_mean_square'`` The unweighted root mean square of the ``root_mean_square`` + :math:`N` non-missing values + :math:`x_i` is - .. math:: RMS=\sqrt{\frac{1}{N} + .. math:: r=\sqrt{\frac{1}{N} \sum_{i=1}^{N} x_{i}^2} The :ref:`weighted ` - root mean square of :math:`N` values - :math:`x_i` with corresponding weights - :math:`w_i` is + root mean square of the :math:`N` + non-missing values :math:`x_i` with + corresponding weights :math:`w_i` is - .. math:: \hat{RMS}=\sqrt{ + .. math:: \hat{r}=\sqrt{ \frac{1}{V_{1}} \sum_{i=1}^{N} w_i x_{i}^2} From 9a5624ccffcc5aa274c68443c72802a8619fb6fa Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 17 Mar 2022 19:11:45 +0000 Subject: [PATCH 15/37] dev --- cf/data/collapse.py | 521 ++++++++++++++++++++----------------------- cf/data/data.py | 13 +- cf/test/test_Data.py | 1 + docs/Makefile | 2 +- 4 files changed, 253 insertions(+), 284 deletions(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 52d34e95d1..08e9a0f8fa 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -1,3 +1,4 @@ +"""Functions used during `Data` object collapses.""" import inspect from functools import partial, reduce from operator import mul @@ -52,8 +53,8 @@ def __docstring_package_depth__(self): """ return 0 - @staticmethod - def max(a, axis=None, keepdims=False, mtol=None, split_every=None): + @classmethod + def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): """Return maximum values of an array. Calculates the maximum value of an array or the maximum values @@ -99,8 +100,10 @@ def max(a, axis=None, keepdims=False, mtol=None, split_every=None): meta=np.array((), dtype=dtype), ) - @staticmethod - def max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): + @classmethod + def max_abs( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): """Return maximum absolute values of an array. Calculates the maximum absolute value of an array or the @@ -131,24 +134,23 @@ def max_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. 
""" - check_input_dtype(a) - dtype = a.dtype - return reduction( - a, - cf_max_abs_chunk, - partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + return cls.max( + abs(a), axis=axis, keepdims=keepdims, - dtype=dtype, + mtol=mtol, split_every=split_every, - combine=cf_max_combine, - concatenate=False, - meta=np.array((), dtype=dtype), ) - @staticmethod + @classmethod def mean( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return mean values of an array. @@ -198,9 +200,15 @@ def mean( weights=weights, ) - @staticmethod + @classmethod def mean_abs( - a, weights=None, axis=None, keepdims=False, mtol=None, split_every=None + cls, + a, + weights=None, + axis=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return mean absolute values of an array. @@ -234,25 +242,24 @@ def mean_abs( The collapsed array. """ - check_input_dtype(a) - dtype = "f8" - return reduction( - a, - cf_mean_abs_chunk, - partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + return cls.mean( + abs(a), + weights=weights, axis=axis, keepdims=keepdims, - dtype=dtype, + mtol=mtol, split_every=split_every, - combine=cf_mean_combine, - concatenate=False, - meta=np.array((), dtype=dtype), - weights=weights, ) - @staticmethod + @classmethod def mid_range( - a, axis=None, dtype=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + dtype=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return mid-range values of an array. @@ -284,7 +291,7 @@ def mid_range( The collapsed array. """ - check_input_dtype(a, allow="fi") + check_input_dtype(a, allowed="fi") dtype = "f8" return reduction( a, @@ -299,8 +306,8 @@ def mid_range( meta=np.array((), dtype=dtype), ) - @staticmethod - def min(a, axis=None, keepdims=False, mtol=None, split_every=None): + @classmethod + def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): """Return minimum values of an array. Calculates the minimum value of an array or the minimum values @@ -346,8 +353,10 @@ def min(a, axis=None, keepdims=False, mtol=None, split_every=None): meta=np.array((), dtype=dtype), ) - @staticmethod - def min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): + @classmethod + def min_abs( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): """Return minimum absolute values of an array. Calculates the minimum absolute value of an array or the @@ -378,23 +387,16 @@ def min_abs(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. """ - check_input_dtype(a) - dtype = a.dtype - return reduction( - a, - cf_min_abs_chunk, - partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + return cls.min( + abs(a), axis=axis, keepdims=keepdims, - dtype=dtype, + mtol=mtol, split_every=split_every, - combine=cf_min_combine, - concatenate=False, - meta=np.array((), dtype=dtype), ) - @staticmethod - def range(a, axis=None, keepdims=False, mtol=None, split_every=None): + @classmethod + def range(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): """Return range values of an array. Calculates the range value of an array or the range values @@ -425,7 +427,7 @@ def range(a, axis=None, keepdims=False, mtol=None, split_every=None): The collapsed array. 
""" - check_input_dtype(a, allow="fi") + check_input_dtype(a, allowed="fi") dtype = a.dtype return reduction( a, @@ -440,9 +442,15 @@ def range(a, axis=None, keepdims=False, mtol=None, split_every=None): meta=np.array((), dtype=dtype), ) - @staticmethod + @classmethod def rms( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return root mean square (RMS) values of an array. @@ -492,8 +500,10 @@ def rms( weights=weights, ) - @staticmethod - def sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): + @classmethod + def sample_size( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): """Return sample size values of an array. Calculates the sample size value of an array or the sample @@ -539,9 +549,15 @@ def sample_size(a, axis=None, keepdims=False, mtol=None, split_every=None): meta=np.array((), dtype=dtype), ) - @staticmethod + @classmethod def sum( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return sum values of an array. @@ -595,9 +611,15 @@ def sum( weights=weights, ) - @staticmethod + @classmethod def sum_of_weights( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return sum of weights values for an array. @@ -647,9 +669,15 @@ def sum_of_weights( weights=weights, ) - @staticmethod + @classmethod def sum_of_weights2( - a, axis=None, weights=None, keepdims=False, mtol=None, split_every=None + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, ): """Return sum of squares of weights values for an array. @@ -687,7 +715,7 @@ def sum_of_weights2( dtype = "f8" return reduction( a, - partial(cf_sum_of_weights_chunk, squared=True), + partial(cf_sum_of_weights_chunk, square=True), partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -699,8 +727,9 @@ def sum_of_weights2( weights=weights, ) - @staticmethod + @classmethod def var( + cls, a, axis=None, weights=None, @@ -760,7 +789,7 @@ def var( ) -def check_input_dtype(a, allow="fib"): +def check_input_dtype(a, allowed="fib"): """Check that data has a data type allowed by a collapse method. The collapse method is assumed to be defined by the calling @@ -771,7 +800,7 @@ def check_input_dtype(a, allow="fib"): a: `dask.array.Array` The data. - allow: `str`, optional + allowed: `str`, optional The data type kinds allowed by the collapse method. Defaults to ``'fib'``, meaning that only float, integer and Boolean data types are allowed. @@ -781,7 +810,7 @@ def check_input_dtype(a, allow="fib"): `None` """ - if a.dtype.kind not in allow: + if a.dtype.kind not in allowed: method = inspect.currentframe().f_back.f_code.co_name raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") @@ -887,7 +916,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): def sum_weights_chunk( - x, weights=None, squared=False, N=None, dtype="f8", **kwargs + x, weights=None, square=False, N=None, dtype="f8", **kwargs ): """Sum the weights. @@ -896,7 +925,7 @@ def sum_weights_chunk( :Parameters: x: `numpy.ndarray` - The collapsed data. + The data. weights: `numpy.ndarray`, optional The weights associated with values of the data. 
Must have @@ -907,7 +936,7 @@ def sum_weights_chunk( missing data, or that correspond to the missing elements of the data, are assigned a weight of 0. - squared: `bool`, optional + square: `bool`, optional If True calculate the sum of the squares of the weights. N: `numpy.ndarray`, optional @@ -931,7 +960,7 @@ def sum_weights_chunk( return N - if squared: + if square: weights = np.multiply(weights, weights, dtype=dtype) if np.ma.is_masked(x): @@ -1001,15 +1030,21 @@ def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): ) -def sum_sample_sizes(pairs, axis, **kwargs): +def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): """Alias of `combine_arrays` with ``key="N", func=chunk.sum, - dtype="i8", computing_meta=False``. + dtype="i8"``. .. versionadded:: TODODASK """ return combine_arrays( - pairs, "N", chunk.sum, axis, dtype="i8", computing_meta=False, **kwargs + pairs, + "N", + chunk.sum, + axis, + dtype="i8", + computing_meta=computing_meta, + **kwargs, ) @@ -1019,23 +1054,25 @@ def sum_sample_sizes(pairs, axis, **kwargs): def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the mean. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * V1: The sum of ``weights`` (equal to ``N`` if weights are not set). * sum: The weighted sum of ``x``. + * weighted: True if weights have been set. """ if computing_meta: @@ -1045,7 +1082,6 @@ def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) d["V1"] = sum_weights_chunk(x, weights, N=d["N"], **kwargs) - d["weighted"] = weights is not None return d @@ -1058,13 +1094,16 @@ def cf_mean_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the mean. + """Combination calculations for the mean. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: @@ -1074,18 +1113,16 @@ def cf_mean_combine( if not isinstance(pairs, list): pairs = [pairs] - d = {"weighted": next(flatten(pairs))["weighted"]} + weighted = next(flatten(pairs))["weighted"] + d = {"weighted": weighted} d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) if computing_meta: return d["sum"] d["N"] = sum_sample_sizes(pairs, axis, **kwargs) - - if d["weighted"]: - d["V1"] = sum_arrays( - pairs, "V1", axis, dtype, computing_meta, **kwargs - ) + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) else: d["V1"] = d["N"] @@ -1101,9 +1138,9 @@ def cf_mean_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the mean. + """Aggregation calculations for the mean. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1118,7 +1155,8 @@ def cf_mean_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. 
:Returns: @@ -1135,58 +1173,26 @@ def cf_mean_agg( return x -# -------------------------------------------------------------------- -# mean_absolute_value -# -------------------------------------------------------------------- -def cf_mean_abs_chunk( - x, weights=None, dtype=None, computing_meta=False, **kwargs -): - """Chunk calculations for the mean of the absolute values. - - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. - - .. versionadded:: TODODASK - - :Parameters: - - See `dask.array.reductions` for details. - - :Returns: - - `dict` - Dictionary with the keys: - * N: The sample size. - * V1: The sum of ``weights`` (equal to ``N`` if weights - are not set). - * sum: The weighted sum of ``abs(x)``. - - """ - if computing_meta: - return x - - return cf_mean_chunk(np.abs(x), weights, dtype=dtype, **kwargs) - - # -------------------------------------------------------------------- # maximum # -------------------------------------------------------------------- def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the maximum. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * max: The maximum of `x``. @@ -1206,13 +1212,16 @@ def cf_max_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the maximum. + """Combination calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: @@ -1222,14 +1231,12 @@ def cf_max_combine( if not isinstance(pairs, list): pairs = [pairs] - # Create a nested list of maxima and recursively concatenate it - # along the specified axes - m = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) if computing_meta: - return m + return mx return { - "max": m, + "max": mx, "N": sum_sample_sizes(pairs, axis, **kwargs), } @@ -1242,9 +1249,9 @@ def cf_max_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the maximum. + """Aggregation calculations for the maximum. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1259,7 +1266,8 @@ def cf_max_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1276,35 +1284,6 @@ def cf_max_agg( return x -# -------------------------------------------------------------------- -# maximum_absolute_value -# -------------------------------------------------------------------- -def cf_max_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the maximum of absolute values. - - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. - - .. versionadded:: TODODASK - - :Parameters: - - See `dask.array.reductions` for details. 
- - :Returns: - - `dict` - Dictionary with the keys: - * N: The sample size. - * max: The maximum of ``abs(x)``. - - """ - if computing_meta: - return x - - return cf_max_chunk(np.abs(x), **kwargs) - - # -------------------------------------------------------------------- # mid-range # -------------------------------------------------------------------- @@ -1317,9 +1296,9 @@ def cf_mid_range_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the mid-range. + """Aggregation calculations for the mid-range. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1334,7 +1313,8 @@ def cf_mid_range_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1358,19 +1338,20 @@ def cf_mid_range_agg( def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the minimum. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * min: The minimum of ``x``. @@ -1390,13 +1371,16 @@ def cf_min_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the minimum. + """Combination calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: @@ -1406,14 +1390,12 @@ def cf_min_combine( if not isinstance(pairs, list): pairs = [pairs] - # Create a nested list of maxima and recursively concatenate it - # along the specified axes - x = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) + mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) if computing_meta: - return x + return mn return { - "min": x, + "min": mn, "N": sum_sample_sizes(pairs, axis, **kwargs), } @@ -1426,9 +1408,9 @@ def cf_min_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the minimum. + """Aggregation calculations for the minimum. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1443,7 +1425,8 @@ def cf_min_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1460,54 +1443,26 @@ def cf_min_agg( return x -# -------------------------------------------------------------------- -# minimum absolute value -# -------------------------------------------------------------------- -def cf_min_abs_chunk(x, dtype=None, computing_meta=False, **kwargs): - """Chunk calculations for the minimum of absolute values. - - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. - - .. versionadded:: TODODASK - - :Parameters: - - See `dask.array.reductions` for details. - - :Returns: - - `dict` - Dictionary with the keys: - * N: The sample size. 
- * min: The minimum of ``abs(x)``. - - """ - if computing_meta: - return x - - return cf_min_chunk(np.abs(x), **kwargs) - - # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the range. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * min: The minimum of ``x``. * max: The maximum of ``x`. @@ -1516,7 +1471,9 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x + # N, max d = cf_max_chunk(x, **kwargs) + d["min"] = chunk.min(x, **kwargs) return d @@ -1528,13 +1485,16 @@ def cf_range_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the range. + """Combination calculations for the range. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: @@ -1544,13 +1504,11 @@ def cf_range_combine( if not isinstance(pairs, list): pairs = [pairs] - # Create a nested list of maxima and recursively concatenate it - # along the specified axes mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) if computing_meta: return mx - mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) + mn = min_arrays(pairs, "min", axis, None, **kwargs) return { "max": mx, @@ -1567,9 +1525,9 @@ def cf_range_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the range. + """Aggregation calculations for the range. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1584,7 +1542,8 @@ def cf_range_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1608,19 +1567,20 @@ def cf_range_agg( def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the root mean square (RMS).. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * sum: The weighted sum of ``x**2``. @@ -1642,9 +1602,9 @@ def cf_rms_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the root mean square (RMS). + """Aggregation calculations for the root mean square (RMS). - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1659,7 +1619,8 @@ def cf_rms_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. 
+ See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1682,19 +1643,20 @@ def cf_rms_agg( def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): """Chunk calculations for the sample size. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. """ @@ -1719,13 +1681,16 @@ def cf_sample_size_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the sample size. + """Combination calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: @@ -1751,9 +1716,9 @@ def cf_sample_size_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the sample size. + """Aggregation calculations for the sample size. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1768,7 +1733,8 @@ def cf_sample_size_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1791,19 +1757,20 @@ def cf_sample_size_agg( def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the sum. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * sum: The weighted sum of ``x`` @@ -1826,13 +1793,16 @@ def cf_sum_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the sum. + """Combination calculations for the sum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. :Returns: @@ -1842,8 +1812,6 @@ def cf_sum_combine( if not isinstance(pairs, list): pairs = [pairs] - # Create a nested list of maxima and recursively concatenate it - # along the specified axes x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) if computing_meta: return x @@ -1863,9 +1831,9 @@ def cf_sum_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the sum. + """Aggregation calculations for the sum. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. versionadded:: TODODASK @@ -1880,7 +1848,8 @@ def cf_sum_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. 
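In the same spirit, the chunk-level contract for the sum can be exercised with a masked array (invented values; masked elements contribute to neither N nor the sum):

    import numpy as np

    x = np.ma.array([1.0, 2.0, 3.0], mask=[False, True, False])
    w = np.array([10.0, 10.0, 10.0])

    # Mirrors cf_sum_chunk: multiply by the weights, sum what is unmasked
    d = {"N": x.count(), "sum": (x * w).sum()}
    assert d == {"N": 2, "sum": 40.0}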
:Returns: @@ -1901,28 +1870,30 @@ def cf_sum_agg( # sum of weights # -------------------------------------------------------------------- def cf_sum_of_weights_chunk( - x, weights=None, dtype="f8", computing_meta=False, squared=False, **kwargs + x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs ): """Chunk calculations for the sum of the weights. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. :Parameters: - squared: `bool`, optional + square: `bool`, optional If True then calculate the sum of the squares of the weights. - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the other + parameters. :Returns: `dict` Dictionary with the keys: + * N: The sample size. * sum: The sum of ``weights``, or the sum of - ``weights**2`` if *squared* is True. + ``weights**2`` if *square* is True. """ if computing_meta: @@ -1932,7 +1903,7 @@ def cf_sum_of_weights_chunk( d = cf_sample_size_chunk(x, **kwargs) d["sum"] = sum_weights_chunk( - x, weights=weights, squared=squared, N=d["N"], **kwargs + x, weights=weights, square=square, N=d["N"], **kwargs ) return d @@ -1946,8 +1917,8 @@ def cf_var_chunk( ): """Chunk calculations for the variance. - This function is passed to `dask.array.reduction` as callable - *chunk* parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. See https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics @@ -1963,7 +1934,8 @@ def cf_var_chunk( represents the number of non-missing elements. A value of 1 applies Bessel's correction. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: @@ -1973,7 +1945,8 @@ def cf_var_chunk( * N: The sample size. * V1: The sum of ``weights`` (equal to ``N`` if weights are not set). - * V2: The sum of ``weights**2``. + * V2: The sum of ``weights**2``, or `None` of not + required. * sum: The weighted sum of ``x``. * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is the weighted biased (i.e. ``ddof=0``) variance of @@ -1985,6 +1958,8 @@ def cf_var_chunk( if computing_meta: return x + weighted = weights is not None + # N, V1, sum d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) @@ -1994,7 +1969,7 @@ def cf_var_chunk( avg = divide(wsum, V1, dtype=dtype) part = x - avg part *= part - if weights is not None: + if weighted: part = part * weights part = chunk.sum(part, dtype=dtype, **kwargs) @@ -2002,14 +1977,12 @@ def cf_var_chunk( d["part"] = part - if ddof == 1: - d["V2"] = sum_weights_chunk( - x, weights, squared=True, N=d["N"], **kwargs - ) + if weighted and ddof == 1: + d["V2"] = sum_weights_chunk(x, weights, square=True, **kwargs) else: - d["V2"] = d["N"] + d["V2"] = None - d["weighted"] = weights is not None + d["weighted"] = weighted d["ddof"] = ddof return d @@ -2022,13 +1995,16 @@ def cf_var_combine( computing_meta=False, **kwargs, ): - """Combine calculations for the variance. + """Combination calculations for the variance. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. .. versionadded:: TODODASK :Parameters: - See `dask.array.reductions` for details. + See `dask.array.reductions` for details of the parameters. 
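The identity stated for the part key above can be confirmed directly with numpy (illustrative values, not library code):

    import numpy as np

    x = np.array([1.0, 2.0, 4.0])
    w = np.array([1.0, 2.0, 1.0])

    V1 = w.sum()
    mu = np.average(x, weights=w)
    sigma2 = np.average((x - mu) ** 2, weights=w)  # biased (ddof=0) variance

    # As built in cf_var_chunk: weighted sum of squared anomalies, plus
    # the weighted mean times the weighted sum
    part = (w * (x - mu) ** 2).sum() + mu * (w * x).sum()
    assert np.isclose(part, V1 * (sigma2 + mu**2))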
:Returns: @@ -2041,7 +2017,6 @@ def cf_var_combine( d = next(flatten(pairs)) weighted = d["weighted"] ddof = d["ddof"] - d = {"weighted": weighted, "ddof": ddof} d["part"] = sum_arrays( @@ -2050,20 +2025,15 @@ def cf_var_combine( if computing_meta: return d["part"] - d["N"] = sum_sample_sizes(pairs, axis, **kwargs) - - d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, **kwargs) + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) d["V1"] = d["N"] - d["V2"] = d["N"] + d["V2"] = None if weighted: - d["V1"] = sum_arrays( - pairs, "V1", axis, dtype, computing_meta, **kwargs - ) + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) if ddof == 1: - d["V2"] = sum_arrays( - pairs, "V2", axis, dtype, computing_meta, **kwargs - ) + d["V2"] = sum_arrays(pairs, "V2", axis, dtype, **kwargs) return d @@ -2077,9 +2047,9 @@ def cf_var_agg( original_shape=None, **kwargs, ): - """Aggregate calculations for the variance. + """Aggregation calculations for the variance. - This function is passed to `dask.array.reduction` as callable + This function is passed to `dask.array.reduction` as its *aggregate* parameter. .. note:: Weights are interpreted as reliability weights, as @@ -2101,7 +2071,8 @@ def cf_var_agg( original_shape: `tuple` The shape of the original, uncollapsed data. - See `dask.array.reductions` for further details. + See `dask.array.reductions` for details of the other + parameters. :Returns: diff --git a/cf/data/data.py b/cf/data/data.py index ace84ceedd..e101d79f07 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -68,7 +68,7 @@ from .mixin import DataClassDeprecationsMixin from .partition import Partition from .partitionmatrix import PartitionMatrix -from .utils import ( # is_small,; is_very_small,; collapse, +from .utils import ( # is_small,; is_very_small, YMDhms, _is_numeric_dtype, conform_units, @@ -6763,7 +6763,6 @@ def argmax(self, axis=None, unravel=False): is located. unravel: `bool`, optional - If True then when locating the maximum over the whole data, return the location as an index for each axis as a `tuple`. By default an index to the flattened array @@ -12872,9 +12871,8 @@ def square(self, dtype=None, inplace=False): :Returns: `Data` or `None` - The element-wise positive square root of the data - collapsed data, or `None` if the operation was - in-place. + The element-wise square of the data, or `None` if the + operation was in-place. **Examples** @@ -12921,9 +12919,8 @@ def sqrt(self, dtype=None, inplace=False): :Returns: `Data` or `None` - The element-wise positive square root of the data - collapsed data, or `None` if the operation was - in-place. + The element-wise positive square root of the data, or + `None` if the operation was in-place. **Examples** diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 74c35cad98..421dd04bb9 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3474,6 +3474,7 @@ def test_Data_minimum_absolute_value(self): def test_Data_range(self): # Masked array a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) for axis in axis_combinations(a): diff --git a/docs/Makefile b/docs/Makefile index e101c0f118..6850180e1d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. 
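The corrected square and sqrt descriptions above amount to the following behaviour (an illustrative sketch, assuming cf is importable; output shown is indicative):

    >>> import cf
    >>> d = cf.Data([4.0, 9.0], 'm2')
    >>> print(d.sqrt().array)
    [2. 3.]
    >>> print(d.square().array)
    [16. 81.]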
-SPHINXOPTS = -j 2 +SPHINXOPTS = -j 1 SPHINXBUILD = sphinx-build PAPER = #BUILDDIR = build From c2cf274835e82a0f6ff0d84fecba1d40a6420b05 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 18 Mar 2022 09:32:53 +0000 Subject: [PATCH 16/37] Fix bug in cf_percentile mtol inequality --- cf/data/dask_utils.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 810d88cbc0..73dc7aed8e 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -262,16 +262,22 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): original array *a*. mtol: number, optional - Set an upper limit of the amount input data values which - are allowed to be missing data when contributing to - individual output percentile values. It is defined as a - fraction (between 0 and 1 inclusive) of the contributing - input data values. The default is 1, meaning that a - missing datum in the output array only occurs when all of - its contributing input array elements are missing data. A - value of 0 means that a missing datum in the output array - occurs whenever any of its contributing input array - elements are missing data. + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. :Returns: @@ -299,7 +305,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): a, axis=axis, keepdims=keepdims ) if n_missing.any(): - mask = np.where(n_missing >= mtol * full_size, True, False) + mask = np.where(n_missing > mtol * full_size, True, False) if q.ndim: mask = np.expand_dims(mask, 0) From f59feb7fd68487423c04c9bae1e5c21b29c37000 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 18 Mar 2022 10:46:27 +0000 Subject: [PATCH 17/37] dev --- cf/data/data.py | 104 +++++++----- cf/test/test_Data.py | 371 ++++++++++++++++--------------------------- 2 files changed, 199 insertions(+), 276 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index e101d79f07..e0356c33f0 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -2206,7 +2206,7 @@ def percentile( [[-- -- -- --] [-- -- -- --] [-- 9 10 11]] - >>> e.sd() + >>> e.std() Find the mean of the values above the 45th percentile along the @@ -7101,8 +7101,8 @@ def min( axes=None, squeeze=False, mtol=1, - inplace=False, split_every=None, + inplace=False, i=False, _preserve_partitions=False, ): @@ -7241,8 +7241,8 @@ def mean( weights=None, squeeze=False, mtol=1, - inplace=False, split_every=None, + inplace=False, i=False, ): """Calculate mean values. @@ -10643,7 +10643,7 @@ def stats( sum=False, sum_of_squares=False, variance=False, - weights=False, + weights=None, ): """Calculate statistics of the data. @@ -10720,17 +10720,7 @@ def stats( Calculate the square root of the weighted or unweighted mean of the squares of the values. - weights: data-like or dict, optional - The weights to apply to the calculations. 
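The inequality change in the cf_percentile fix above (>= to >) means that an output element with exactly ``100*mtol%`` of its contributing values missing is now kept rather than masked. A small numpy check with invented values:

    import numpy as np

    a = np.ma.array([1.0, 2.0, 3.0, 4.0], mask=[0, 1, 0, 1])
    full_size = a.size                   # 4
    n_missing = full_size - a.count()    # 2
    mtol = 0.5

    # New test: exactly 100*mtol% missing is still acceptable
    assert not (n_missing > mtol * full_size)
    # The old test would have masked this element
    assert n_missing >= mtol * full_size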
By default the - statistics are unweighted. - - The weights may be contained in any scalar or array-like - object (such as a numpy array or `Data` instance) that is - broadcastable to the shape of the data. If *weights* is a - dictionary then each key is axes of the array (an `int` or - `tuple` of `int`) with a corresponding data-like value of - weights for those axes. In this case, the implied weights - array is the outer product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} :Returns: @@ -10782,15 +10772,14 @@ def stats( 'sample_size': 5} """ - no_weights = ( "minimum", + "median", "maximum", "range", "mid_range", "minimum_absolute_value", "maximum_absolute_value", - "median", "sum", "sum_of_squares", ) @@ -10814,14 +10803,13 @@ def stats( "variance", ): if all or locals()[stat]: - f = getattr(self, stat) + func = getattr(self, stat) if stat in no_weights: - value = f(squeeze=True) + value = func(squeeze=True) else: - value = f(squeeze=True, weights=weights) + value = func(squeeze=True, weights=weights) out[stat] = value - # --- End: for if all or sample_size: out["sample_size"] = int(self.sample_size()) @@ -12264,8 +12252,8 @@ def sum( weights=None, squeeze=False, mtol=1, - inplace=False, split_every=None, + inplace=False, i=False, ): """Calculate sum values. @@ -12615,13 +12603,13 @@ def sum_of_weights2( @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) - def sd( + def std( self, axes=None, squeeze=False, mtol=1, weights=None, - ddof=None, + ddof=0, split_every=None, inplace=False, i=False, @@ -12649,6 +12637,8 @@ def sd( {{ddof: number}} + By default *ddof* is 0. + {{split_every: `int` or `dict`, optional}} .. versionadded:: TODODASK @@ -12673,15 +12663,15 @@ def sd( [3 -- 5] [6 7 8] [9 10 11]] - >>> d.sd(ddof=0) + >>> d.std() - >>> d.sd(ddof=1) + >>> d.std(ddof=1) >>> w = np.linspace(1, 2, 3) >>> print(w) [1. 1.5 2. ] - >>> d.sd(ddof=1, weights=w) + >>> d.std(ddof=1, weights=w) """ @@ -12707,9 +12697,9 @@ def var( weights=None, squeeze=False, mtol=1, - ddof=None, - inplace=False, + ddof=0, split_every=None, + inplace=False, i=False, ): """Calculate variances. @@ -12735,6 +12725,8 @@ def var( {{ddof: number}} + By default *ddof* is 0. + {{split_every: `int` or `dict`, optional}} .. 
versionadded:: TODODASK @@ -12759,7 +12751,7 @@ def var( [3 -- 5] [6 7 8] [9 10 11]] - >>> d.var(ddof=0) + >>> d.var() >>> d.var(ddof=1) @@ -12771,9 +12763,6 @@ def var( """ - if ddof is None: - raise ValueError("Must set the delta degrees of freedom (ddof)") - d = _inplace_enabled_define_and_cleanup(self) d, _ = _collapse( Collapse.var, @@ -13019,6 +13008,32 @@ def minimum( i=i, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def sd( + self, + axes=None, + squeeze=False, + mtol=1, + weights=None, + ddof=0, + split_every=None, + inplace=False, + i=False, + ): + """Alias for `std`""" + return self.sdt( + axes=axes, + squeeze=squeeze, + weights=weights, + mtol=mtol, + ddof=ddof, + split_every=split_every, + inplace=inplace, + i=i, + ) + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") @@ -13028,17 +13043,19 @@ def standard_deviation( squeeze=False, mtol=1, weights=None, - ddof=None, + ddof=0, + split_every=None, inplace=False, i=False, ): - """Alias for `sd`""" - return self.sd( + """Alias for `std`""" + return self.std( axes=axes, squeeze=squeeze, weights=weights, mtol=mtol, ddof=ddof, + split_every=split_every, inplace=inplace, i=i, ) @@ -13052,7 +13069,8 @@ def variance( squeeze=False, weights=None, mtol=1, - ddof=None, + ddof=0, + split_every=None, inplace=False, i=False, ): @@ -13063,6 +13081,7 @@ def variance( weights=weights, mtol=mtol, ddof=ddof, + split_every=split_every, inplace=inplace, i=i, ) @@ -13200,7 +13219,7 @@ def _collapse( ddof=None, split_every=None, ): - """Collapse data using a given funcion. + """Collapse data in-place using a given funcion. .. versionadded:: TODODASK @@ -13285,7 +13304,7 @@ def _collapse( :Returns: - `Data`, formatted weights + (`Data`, formatted weights) The collapsed data and the output of ``_parse_weights(d, weights, axis)``. @@ -13297,8 +13316,9 @@ def _collapse( "mtol": mtol, } + weights = _parse_weights(d, weights, axis) if weights is not None: - kwargs["weights"] = _parse_weights(d, weights, axis) + kwargs["weights"] = weights if ddof is not None: kwargs["ddof"] = ddof @@ -13375,6 +13395,10 @@ def _parse_weights(d, weights, axis=None): None """ + if weights is None: + # No weights + return + if not isinstance(weights, dict): # Weights is data_like. Don't check broadcastability to d, # leave that to whatever uses the weights. 
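The dictionary form of weights that _parse_weights accepts — each key an axis or tuple of axes, each value the weights for those axes — implies an outer product, as this small sketch with invented per-axis weights shows:

    import numpy as np

    # Per-axis weights for data with shape (2, 3): keys are axis positions
    weights = {0: np.array([1.0, 2.0]), 1: np.array([1.0, 1.0, 3.0])}

    # The implied weights array is the outer product of the dictionary's
    # values, which then broadcasts against the data
    w = np.multiply.outer(weights[0], weights[1])
    assert w.shape == (2, 3)
    assert w[1, 2] == 6.0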
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 421dd04bb9..1fe4d4abea 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -717,16 +717,25 @@ def test_Data_compressed(self): d = cf.Data(self.ma, "km") self.assertTrue((self.ma.compressed() == d.compressed()).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __eq__") def test_Data_stats(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return + d = cf.Data([1, 1]) - d = cf.Data([[0, 1, 2], [3, -99, 5]], mask=[[0, 0, 0], [0, 1, 0]]) - - self.assertIsInstance(d.stats(), dict) - _ = d.stats(all=True) - _ = d.stats(mean_of_upper_decile=True, range=False) + self.assertEqual( + d.stats(sum=True, weights=1), + { + "minimum": 1, + "mean": 1.0, + "median": 1.0, + "maximum": 1, + "range": 0, + "mid_range": 1.0, + "standard_deviation": 0.0, + "root_mean_square": 1.0, + "sum": 2, + "sample_size": 2, + }, + ) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data__init__dtype_mask(self): @@ -2432,97 +2441,6 @@ def test_Data_argmax(self): with self.assertRaises(Exception): d.argmax(axis=d.ndim) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits 'NoneType' is not iterable") - def test_Data__collapse_SHAPE(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - a = np.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) - - for h in ( - "sample_size", - "sum", - "min", - "max", - "mean", - "var", - "sd", - "mid_range", - "range", - "integral", - "maximum_absolute_value", - "minimum_absolute_value", - "sum_of_squares", - "root_mean_square", - "mean_absolute_value", - "median", - "mean_of_upper_decile", - "sum_of_weights", - "sum_of_weights2", - ): - - d = cf.Data(a[(slice(None, None, -1),) * a.ndim].copy()) - d.flip(inplace=True) - _ = cf.Data(self.w.copy()) - - shape = list(d.shape) - - for axes in self.axes_combinations: - e = getattr(d, h)( - axes=axes, squeeze=False, _preserve_partitions=False - ) - - shape = list(d.shape) - for i in axes: - shape[i] = 1 - - shape = tuple(shape) - self.assertEqual( - e.shape, - shape, - "{}, axes={}, not squeezed bad shape: {} != {}".format( - h, axes, e.shape, shape - ), - ) - - for axes in self.axes_combinations: - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=False - ) - shape = list(d.shape) - for i in sorted(axes, reverse=True): - shape.pop(i) - - shape = tuple(shape) - self.assertEqual( - e.shape, - shape, - "{}, axes={}, squeezed bad shape: {} != {}".format( - h, axes, e.shape, shape - ), - ) - - e = getattr(d, h)(squeeze=True, _preserve_partitions=False) - shape = () - self.assertEqual( - e.shape, - shape, - "{}, axes={}, squeezed bad shape: {} != {}".format( - h, None, e.shape, shape - ), - ) - - e = getattr(d, h)(squeeze=False, _preserve_partitions=False) - shape = (1,) * d.ndim - self.assertEqual( - e.shape, - shape, - "{}, axes={}, not squeezed bad shape: {} != {}".format( - h, None, e.shape, shape - ), - ) - # --- End: for - def test_Data_percentile_median(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -3293,16 +3211,6 @@ def test_Data_integral(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.integral(weights=weights) - self.assertEqual(e.Units, cf.Units("K")) - - e = d.integral(weights=cf.Data(weights, "m"), mtol=0) - self.assertEqual(e.Units, cf.Units("K m")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - 
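Several tests in this file iterate over axis_combinations. That helper is not part of this diff; one plausible definition, shown here only for orientation, is:

    from itertools import combinations

    def axis_combinations(a):
        # Every non-empty subset of axis positions, e.g. for a.ndim == 2:
        # (0,), (1,), (0, 1)
        return [
            c
            for n in range(1, a.ndim + 1)
            for c in combinations(range(a.ndim), n)
        ]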
def test_Data_max(self): # Masked array a = self.ma @@ -3319,13 +3227,6 @@ def test_Data_max(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.max(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_maximum_absolute_value(self): # Masked array a = self.ma @@ -3342,13 +3243,6 @@ def test_Data_maximum_absolute_value(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.maximum_absolute_value(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_mean(self): # Masked array, non-masked weights a = self.ma @@ -3367,13 +3261,6 @@ def test_Data_mean(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.mean(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_mean_absolute_value(self): # Masked array, non-masked weights a = self.ma @@ -3392,13 +3279,6 @@ def test_Data_mean_absolute_value(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.mean_absolute_value(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_mid_range(self): # Masked array, non-masked weights a = self.ma @@ -3415,13 +3295,6 @@ def test_Data_mid_range(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.mid_range(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - with self.assertRaises(TypeError): cf.Data([0, 1], dtype=bool).mid_range() @@ -3441,13 +3314,6 @@ def test_Data_min(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.min(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_minimum_absolute_value(self): # Masked array a = self.ma @@ -3464,13 +3330,6 @@ def test_Data_minimum_absolute_value(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.minimum_absolute_value(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_range(self): # Masked array a = self.ma @@ -3488,13 +3347,6 @@ def test_Data_range(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.range(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - with self.assertRaises(TypeError): cf.Data([0, 1], dtype=bool).range() @@ -3516,13 +3368,6 @@ def test_Data_root_mean_square(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.root_mean_square(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_sample_size(self): # Masked array a = self.ma @@ -3553,31 +3398,16 @@ def test_Data_sample_size(self): self.assertTrue(np.allclose(e, b)) - # Check units - d = cf.Data(self.ma, "K", chunks=(2, 3, 2, 5)) - e = d.sample_size(mtol=0) - self.assertEqual(e.Units, cf.Units()) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - - def test_Data_sd(self): + def 
test_Data_std(self): # Masked array, non-masked weights a = self.ma weights = self.w d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) - sd = d.sd(weights=weights, ddof=1) + std = d.std(weights=weights, ddof=1) var = d.var(weights=weights, ddof=1) - self.assertTrue(sd.equals(var.sqrt())) - - # Check units - self.assertEqual(sd.Units, cf.Units("K")) - - # Check mtol - sd = d.sd(ddof=0, mtol=0) - self.assertEqual(sd.array, np.ma.masked) + self.assertTrue(std.equals(var.sqrt())) def test_Data_sum(self): # Masked array, non-masked weights @@ -3597,13 +3427,6 @@ def test_Data_sum(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.sum(mtol=0) - self.assertEqual(e.Units, cf.Units("K")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_sum_of_squares(self): # Masked array, non-masked weights a = self.ma @@ -3622,13 +3445,6 @@ def test_Data_sum_of_squares(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.sum_of_squares(mtol=0) - self.assertEqual(e.Units, cf.Units("K2")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_sum_of_weights(self): # Masked array, non-masked weights a = self.ma @@ -3660,15 +3476,6 @@ def test_Data_sum_of_weights(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.sum_of_weights() - self.assertEqual(e.Units, cf.Units()) - e = d.sum_of_weights(weights=cf.Data(weights, "m"), mtol=0) - self.assertEqual(e.Units, cf.Units("m")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_sum_of_weights2(self): # Masked array, non-masked weights a = self.ma @@ -3694,16 +3501,6 @@ def test_Data_sum_of_weights2(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.sum_of_weights2(weights=weights) - self.assertEqual(e.Units, cf.Units()) - - e = d.sum_of_weights2(weights=cf.Data(weights, "m"), mtol=0) - self.assertEqual(e.Units, cf.Units("m2")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - def test_Data_var(self): # Masked array, non-masked weights a = self.ma @@ -3722,7 +3519,7 @@ def test_Data_var(self): b = b / V1 b = np.ma.asanyarray(b) - e = d.var(axes=axis, weights=weights, ddof=0, squeeze=True) + e = d.var(axes=axis, weights=weights, squeeze=True) e = np.ma.array(e.array) self.assertTrue((e.mask == b.mask).all()) @@ -3763,13 +3560,6 @@ def test_Data_var(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.var(ddof=0, mtol=0) - self.assertEqual(e.Units, cf.Units("K2")) - - # Check mtol - self.assertEqual(e.array, np.ma.masked) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __lt__ and __le__") def test_Data_mean_of_upper_decile(self): # Masked array, non-masked weights @@ -3808,12 +3598,121 @@ def test_Data_mean_of_upper_decile(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - # Check units - e = d.mean_of_upper_decile(mtol=0) - self.assertEqual(e.Units, cf.Units("K2")) + def test_Data_collapse_mtol(self): + # Data with exactly half of its elements masked + d = cf.Data(np.arange(6), "K", mask=[0, 1, 0, 1, 0, 1], chunks=2) + + for func in ( + d.integral, + d.mean, + d.mean_absolute_value, + d.median, + d.min, + d.mid_range, + d.minimum_absolute_value, + d.max, + d.maximum_absolute_value, + d.range, + d.root_mean_square, + d.sample_size, + d.std, + d.sum, + d.sum_of_squares, + d.sum_of_weights, + 
d.sum_of_weights2, + d.var, + ): + self.assertTrue(func(mtol=0.4).array.mask) + self.assertFalse(func(mtol=0.5).array.mask) + + # TODODASK - add in mean_of_upper_decile when it's daskified + + def test_Data_collapse_units(self): + d = cf.Data([1, 2], "K") + + self.assertEqual(d.sample_size().Units, cf.Units()) + + for func in ( + d.integral, + d.mean, + d.mean_absolute_value, + d.median, + d.min, + d.mid_range, + d.minimum_absolute_value, + d.max, + d.maximum_absolute_value, + d.range, + d.root_mean_square, + d.std, + d.sum, + ): + self.assertEqual(func().Units, d.Units) + + for func in ( + d.sum_of_squares, + d.var, + ): + self.assertEqual(func().Units, d.Units ** 2) + + for func in ( + d.sum_of_weights, + d.sum_of_weights2, + ): + self.assertEqual(func().Units, cf.Units()) + + # Weighted + w = cf.Data(1, "m") + self.assertEqual(d.integral(weights=w).Units, d.Units * w.Units) + self.assertEqual(d.sum_of_weights(weights=w).Units, w.Units) + self.assertEqual(d.sum_of_weights2(weights=w).Units, w.Units ** 2) + + # Dimensionless data + d = cf.Data([1, 2]) + self.assertEqual(d.integral(weights=w).Units, w.Units) + + for func in ( + d.sum_of_squares, + d.var, + ): + self.assertEqual(func().Units, cf.Units()) + + # TODODASK - add in mean_of_upper_decile when it's daskified + + def test_Data_collapse_keepdims(self): + d = cf.Data(np.arange(6).reshape(2, 3)) + + for func in ( + d.integral, + d.mean, + d.mean_absolute_value, + d.median, + d.min, + d.mid_range, + d.minimum_absolute_value, + d.max, + d.maximum_absolute_value, + d.range, + d.root_mean_square, + d.sample_size, + d.std, + d.sum, + d.sum_of_squares, + d.sum_of_weights, + d.sum_of_weights2, + d.var, + ): + for axis in axis_combinations(d): + e = func(axes=axis, squeeze=False) + s = [1 if i in axis else n for i, n in enumerate(d.shape)] + self.assertEqual(e.shape, tuple(s)) + + for axis in axis_combinations(d): + e = func(axes=axis, squeeze=True) + s = [n for i, n in enumerate(d.shape) if i not in axis] + self.assertEqual(e.shape, tuple(s)) - # Check mtol - self.assertEqual(e.array.item(), np.ma.masked) + # TODODASK - add in mean_of_upper_decile if __name__ == "__main__": From 7320e046d2f14f1e85d61842f98c7ea865040ac3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 18 Mar 2022 10:52:37 +0000 Subject: [PATCH 18/37] dask v2022.03.0 --- cf/__init__.py | 2 +- requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cf/__init__.py b/cf/__init__.py index b1aceef9a6..61c48f6347 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -185,7 +185,7 @@ ) # Check the version of dask -_minimum_vn = "2020.2.1" +_minimum_vn = "2022.03.0" if LooseVersion(dask.__version__) < LooseVersion(_minimum_vn): raise RuntimeError( f"Bad dask version: cf requires dask>={_minimum_vn}. 
" diff --git a/requirements.txt b/requirements.txt index e254ad5396..c2344d9f87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ netCDF4>=1.5.4 -cftime>=1.5.0 +cftime>=1.6.0 numpy>=1.22 cfdm>=1.9.0.1, <1.9.1.0 psutil>=0.6.0 cfunits>=3.3.4 -dask>=2022.2.1 +dask>=2022.03.0 From 72b29df5e218afed0d34a8a6d3069e6a02eab049 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 18 Mar 2022 13:17:51 +0000 Subject: [PATCH 19/37] collapse dtype --- cf/data/collapse.py | 36 ++++++++++++++++----------- cf/test/test_Data.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 14 deletions(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 08e9a0f8fa..42d91509be 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -592,10 +592,9 @@ def sum( """ check_input_dtype(a) - if weights is None: - dtype = double_precision_dtype(a) - else: - dtype = "f8" + dtype = double_precision_dtype(a) + if weights is not None: + dtype = np.result_type(double_precision_dtype(weights), dtype) return reduction( a, @@ -654,7 +653,7 @@ def sum_of_weights( """ check_input_dtype(a) - dtype = "f8" + dtype = double_precision_dtype(weights, default="i8") return reduction( a, cf_sum_of_weights_chunk, @@ -712,7 +711,7 @@ def sum_of_weights2( """ check_input_dtype(a) - dtype = "f8" + dtype = double_precision_dtype(weights, default="i8") return reduction( a, partial(cf_sum_of_weights_chunk, square=True), @@ -815,13 +814,17 @@ def check_input_dtype(a, allowed="fib"): raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") -def double_precision_dtype(a, bool_type="i"): +def double_precision_dtype(a, default=None, bool_type="i"): """Returns the corresponding double precision data type of an array. :Parameters: - a: `dask.array.Array` - The data. + a: `dask.array.Array` or `None` + The data. If `None` then the value of *default* is + returned* + + default: `str`, optional + If *a* is `None`, then return this data type. bool_type: `str`, optional The corresponding double data type kind for Boolean @@ -843,10 +846,16 @@ def double_precision_dtype(a, bool_type="i"): f8 f8 i8 + >>> double_precision_dtype(np.array(1, dtype=bool), bool_type='f') 'f8' + >>> double_precision_dtype(None, default="i8") + 'i8' """ + if a is None: + return default + kind = a.dtype.kind if kind == "b": return bool_type + "8" @@ -915,9 +924,7 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): return x -def sum_weights_chunk( - x, weights=None, square=False, N=None, dtype="f8", **kwargs -): +def sum_weights_chunk(x, weights=None, square=False, N=None, **kwargs): """Sum the weights. .. versionadded:: TODODASK @@ -948,7 +955,7 @@ def sum_weights_chunk( :Returns: `numpy.ndarray` - The sum of the weights. + The sum of the weights, with data type "i8" or "f8". """ if weights is None: @@ -956,10 +963,11 @@ def sum_weights_chunk( # the squares of the weights are both equal to the sample # size. 
if N is None: - return cf_sample_size_chunk(x, **kwargs)["N"] + N = cf_sample_size_chunk(x, **kwargs)["N"] return N + dtype = double_precision_dtype(weights) if square: weights = np.multiply(weights, weights, dtype=dtype) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 33de950785..a07675f781 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3716,6 +3716,64 @@ def test_Data_collapse_keepdims(self): # TODODASK - add in mean_of_upper_decile + def test_Data_collapse_dtype(self): + d = cf.Data([1, 2, 3, 4], dtype="i4", chunks=2) + e = cf.Data([1.0, 2, 3, 4], dtype="f4", chunks=2) + self.assertTrue(d.dtype, "i4") + self.assertTrue(e.dtype, "f4") + + for x, r in zip((d, e), ("i4", "f4")): + for func in ( + x.min, + x.minimum_absolute_value, + x.max, + x.maximum_absolute_value, + x.range, + ): + self.assertEqual(func().dtype, r) + + for x, r in zip((d, e), ("i8", "f8")): + for func in ( + x.integral, + x.sum, + x.sum_of_squares, + ): + self.assertEqual(func().dtype, r) + + for x, r in zip((d, e), ("f8", "f8")): + for func in ( + x.mean, + x.mean_absolute_value, + x.median, + x.mid_range, + x.root_mean_square, + x.std, + x.var, + ): + self.assertEqual(func().dtype, r) + + x = d + for func in ( + x.sum_of_weights, + x.sum_of_weights2, + ): + self.assertEqual(func().dtype, "i8") + + # Weights + w_int = cf.Data(1, dtype="i4") + w_float = cf.Data(1.0, dtype="f4") + for w, r in zip((w_int, w_float), ("i8", "f8")): + for func in ( + d.integral, + d.sum, + d.sum_of_squares, + d.sum_of_weights, + d.sum_of_weights2, + ): + self.assertTrue(func(weights=w).dtype, r) + + # TODODASK - add in mean_of_upper_decile + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 728a47ae284cf0df75812a45886a57cce3aa1024 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 12 Apr 2022 09:04:03 +0100 Subject: [PATCH 20/37] Typo Co-authored-by: Sadie L. Bartholomew --- cf/docstring/docstring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index dae17ec8d7..26537d7cec 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -245,7 +245,7 @@ collapsed, resulting in output with size 1. Each axis is identified by its integer position. If *axes* is an empty sequence then the collapse is applied to each - scalar element and the reuslt has the same shape as + scalar element and the result has the same shape as the input data.""", # collapse squeeze "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional From dc7bf534f47e82552f99ce3a8bd6cd891dffdff4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 12 Apr 2022 09:04:49 +0100 Subject: [PATCH 21/37] Typo Co-authored-by: Sadie L. Bartholomew --- docs/source/field_analysis.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index 87c11d5850..4f3a3142ec 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -147,7 +147,7 @@ Method Description Cell met .. math:: \max\{|x_0|, \ldots, |x_N|\} -``'minimum_absolute_value'`` The minimum of the absolute absolute ``minimum_absolute_value`` +``'minimum_absolute_value'`` The minimum of the non-missing absolute ``minimum_absolute_value`` values. .. math:: \min\{|x_0|, \ldots, |x_N|\} From b47fc2da0abb966d8e07f6f3c609ab3cbe7a5d15 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 12 Apr 2022 09:05:03 +0100 Subject: [PATCH 22/37] Typo Co-authored-by: Sadie L. 
Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 0ccb0e0a62..e9d8386c90 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -7394,7 +7394,7 @@ def mean_absolute_value( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} From 816925ae0bf3a361439470c82d15b431c130d9d5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 12 Apr 2022 09:06:25 +0100 Subject: [PATCH 23/37] Typos --- cf/data/data.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 0ccb0e0a62..8d4f210d17 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1902,7 +1902,7 @@ def median( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{inplace: `bool`, optional}} @@ -1968,7 +1968,7 @@ def mean_of_upper_decile( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} TODODASK - note that mtol only applies to the calculation of the upper decile, not the @@ -2142,7 +2142,7 @@ def percentile( is guaranteed to broadcast correctly against the original data. - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7036,7 +7036,7 @@ def max( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7105,7 +7105,7 @@ def maximum_absolute_value( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7175,7 +7175,7 @@ def min( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7243,7 +7243,7 @@ def minimum_absolute_value( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7316,7 +7316,7 @@ def mean( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7394,7 +7394,7 @@ def mean_absolute_value( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7471,7 +7471,7 @@ def integral( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -7563,7 +7563,7 @@ def sample_size( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -10299,7 +10299,7 @@ def mid_range( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -10614,7 +10614,7 @@ def root_mean_square( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -12204,7 +12204,7 @@ def range( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -12327,7 +12327,7 @@ def sum( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -12406,7 +12406,7 @@ 
def sum_of_squares( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -12491,7 +12491,7 @@ def sum_of_weights( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -12590,7 +12590,7 @@ def sum_of_weights2( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{split_every: `int` or `dict`, optional}} @@ -12685,7 +12685,7 @@ def std( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{ddof: number}} @@ -12773,7 +12773,7 @@ def var( {{collapse squeeze: `bool`, optional}} - {{mtol: number, optional} + {{mtol: number, optional}} {{ddof: number}} From 61567b267007ec89257e7cf33575965507796b2d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 12 Apr 2022 09:14:15 +0100 Subject: [PATCH 24/37] keep source in harden/soften mask --- cf/data/data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index b24b9383e9..725d4523dc 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -43,7 +43,6 @@ from ..functions import rtol as cf_rtol from ..mixin_container import Container from ..units import Units - from . import NetCDFArray, UMArray from .collapse import Collapse from .creation import ( @@ -9112,7 +9111,7 @@ def harden_mask(self): """ dx = self.to_dask_array() dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) - self._set_dask(dx, reset_mask_hardness=False) + self._set_dask(dx, delete_source=False, reset_mask_hardness=False) self._hardmask = True def has_calendar(self): @@ -9209,7 +9208,7 @@ def soften_mask(self): """ dx = self.to_dask_array() dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) - self._set_dask(dx, reset_mask_hardness=False) + self._set_dask(dx, delete_source=False, reset_mask_hardness=False) self._hardmask = False @daskified(_DASKIFIED_VERBOSE) From 6f6ad335037d541fc9ee3aff79c126c61399e5f2 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:43:11 +0100 Subject: [PATCH 25/37] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 725d4523dc..540baa3d06 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -5191,7 +5191,7 @@ def chunks(self): @property def force_compute(self): - """TODODASK See also confg settings.""" + """TODODASK See also config settings.""" return self._custom.get("force_compute", False) @force_compute.setter From 788fbae039590c6f006a20cc0886c4ebb56cb440 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:43:36 +0100 Subject: [PATCH 26/37] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 540baa3d06..fec70f3fb5 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -9822,7 +9822,7 @@ def override_calendar(self, calendar, inplace=False, i=False): def to_dask_array(self): """Store the data array on disk. - There is no change to partition's whose sub-arrays are already on + There is no change to partitions whose sub-arrays are already on disk. :Returns: From c87d1d3209d6ae100a5ff6c07c23d689baf1ffdb Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:44:18 +0100 Subject: [PATCH 27/37] Corrected docs Co-authored-by: Sadie L. 
Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index fec70f3fb5..b3da85914f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -12946,7 +12946,7 @@ def section( @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def square(self, dtype=None, inplace=False): - """Calculate the non-negative square root. + """Calculate the element-wise square. .. versionadded:: TODODASK From 3c65cbdda35ba7f1ec8f6a24eb8d3c3d4de5572b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:44:38 +0100 Subject: [PATCH 28/37] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index b3da85914f..c306af97f5 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13028,7 +13028,7 @@ def sqrt(self, dtype=None, inplace=False): [[0.0 1.0 1.4142135623730951 -- 2.0]] Negative values raise a warning but nonetheless result in NaN - or, if the there are already missing values, missing data: + or, if there are already missing values, missing data: >>> import warnings >>> d = cf.Data([0, 1, -4]) From f7fc780d1495e518436dee2031fd92c7111a3ae0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:45:15 +0100 Subject: [PATCH 29/37] Clarity Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index c306af97f5..fb6f1f0a16 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13027,7 +13027,7 @@ def sqrt(self, dtype=None, inplace=False): >>> print(e.array) [[0.0 1.0 1.4142135623730951 -- 2.0]] - Negative values raise a warning but nonetheless result in NaN + Negative input values raise a warning but nonetheless result in NaN or, if there are already missing values, missing data: >>> import warnings From ff48ef80e119baaccc60a3ac7009df854c666264 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:47:51 +0100 Subject: [PATCH 30/37] Correct return statement in alias Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index fb6f1f0a16..2035156a90 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13128,7 +13128,7 @@ def sd( i=False, ): """Alias for `std`""" - return self.sdt( + return self.std( axes=axes, squeeze=squeeze, weights=weights, From fe2f8823072789db5c5c975f20f18905b8c799e6 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:48:24 +0100 Subject: [PATCH 31/37] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/collapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 42d91509be..23954a7566 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -821,7 +821,7 @@ def double_precision_dtype(a, default=None, bool_type="i"): a: `dask.array.Array` or `None` The data. If `None` then the value of *default* is - returned* + returned*. default: `str`, optional If *a* is `None`, then return this data type. From 95201901dabd7dce96644668f727767217546180 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:50:47 +0100 Subject: [PATCH 32/37] Typo Co-authored-by: Sadie L. 
Bartholomew --- cf/data/collapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 23954a7566..1a0cf07386 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -863,7 +863,7 @@ def double_precision_dtype(a, default=None, bool_type="i"): if kind in "fi": return kind + "8" - raise TypeError("Can't collapse data with {a.dtype!r}") + raise TypeError(f"Can't collapse data with {a.dtype!r}") def mask_small_sample_size(x, N, axis, mtol, original_shape): From 69ed1e8bbf57f8dbe7ab8e5dca4522159481527e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 15:51:23 +0100 Subject: [PATCH 33/37] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/collapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/collapse.py b/cf/data/collapse.py index 1a0cf07386..5b938f99a1 100644 --- a/cf/data/collapse.py +++ b/cf/data/collapse.py @@ -1573,7 +1573,7 @@ def cf_range_agg( # root mean square # -------------------------------------------------------------------- def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): - """Chunk calculations for the root mean square (RMS).. + """Chunk calculations for the root mean square (RMS). This function is passed to `dask.array.reduction` as its *chunk* parameter. From 441f74bb6585e96c6aef5dd47cd615a8d623ba1e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 16:05:41 +0100 Subject: [PATCH 34/37] collapse datea type comments --- cf/test/test_Data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 79dd86dc70..f01c35779c 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3750,6 +3750,8 @@ def test_Data_collapse_dtype(self): self.assertTrue(d.dtype, "i4") self.assertTrue(e.dtype, "f4") + # Cases for which both d and e collapse to a result of the + # same data type for x, r in zip((d, e), ("i4", "f4")): for func in ( x.min, @@ -3760,6 +3762,8 @@ def test_Data_collapse_dtype(self): ): self.assertEqual(func().dtype, r) + # Cases for which both d and e collapse to a result of the + # double of same data type for x, r in zip((d, e), ("i8", "f8")): for func in ( x.integral, @@ -3768,6 +3772,8 @@ def test_Data_collapse_dtype(self): ): self.assertEqual(func().dtype, r) + # Cases for which both d and e collapse to a result of double + # float data type for x, r in zip((d, e), ("f8", "f8")): for func in ( x.mean, From 0d87f44f7b5c96bae0493b602eee7a660403354b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 13 Apr 2022 16:14:27 +0100 Subject: [PATCH 35/37] deprecate _HDF_chunks --- cf/data/data.py | 17 ----------------- cf/data/mixin/deprecations.py | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 2035156a90..17844a9f25 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -5110,23 +5110,6 @@ def _cyclic(self, value): def _cyclic(self): self._custom["_cyclic"] = _empty_set - @property - def _HDF_chunks(self): - """The HDF chunksizes. - - DO NOT CHANGE IN PLACE. 
- - """ - return self._custom["_HDF_chunks"] - - @_HDF_chunks.setter - def _HDF_chunks(self, value): - self._custom["_HDF_chunks"] = value - - @_HDF_chunks.deleter - def _HDF_chunks(self): - del self._custom["_HDF_chunks"] - @property @daskified(_DASKIFIED_VERBOSE) def _hardmask(self): diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 34c910c62c..bcb1921b2e 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -69,6 +69,31 @@ def __hash__(self): removed_at="5.0.0", ) + @property + def _HDF_chunks(self): + """The HDF chunksizes. + + Deprecated at version TODODASK. + + DO NOT CHANGE IN PLACE. + + """ + _DEPRECATION_ERROR_ATTRIBUTE( + self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0" + ) # pragma: no cover + + @_HDF_chunks.setter + def _HDF_chunks(self, value): + _DEPRECATION_ERROR_ATTRIBUTE( + self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0" + ) # pragma: no cover + + @_HDF_chunks.deleter + def _HDF_chunks(self): + _DEPRECATION_ERROR_ATTRIBUTE( + self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0" + ) # pragma: no cover + @property def Data(self): """Deprecated at version 3.0.0, use attribute `data` instead.""" From 82d3508e7872365f531cabd469a550b66c2ebd91 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 19 Apr 2022 18:53:47 +0100 Subject: [PATCH 36/37] Data.sqrt: incompatible units --- cf/data/data.py | 7 ++++++- cf/test/test_Data.py | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 17844a9f25..254581cd9c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13040,7 +13040,12 @@ def sqrt(self, dtype=None, inplace=False): units = d.Units if units: - d.override_units(units ** 0.5, inplace=True) + try: + d.override_units(units ** 0.5, inplace=True) + except ValueError as e: + raise type(e)( + f"Incompatible units for taking a square root: {units!r}" + ) return d diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index f01c35779c..2aed0e1ccd 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3223,6 +3223,11 @@ def test_Data_sqrt(self): self.assertEqual(e.dtype, asqrt.dtype) self.assertTrue((e.array == asqrt).all()) + # Incompatible units + d = cf.Data(a, "m") + with self.assertRaises(ValueError): + d.sqrt() + def test_Data_integral(self): # Masked array, non-masked weights a = self.ma From 60fcb17b57043379eb0c2763b88bda1f97a01e62 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Apr 2022 08:58:21 +0100 Subject: [PATCH 37/37] cell methods clarification --- docs/source/field_analysis.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index 4f3a3142ec..fe2b323451 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -125,8 +125,9 @@ Collapse methods ^^^^^^^^^^^^^^^^ The following collapse methods are available, over any subset of the -domain axes. The "Cell method" column in the table gives the method of -the new cell method construct (if one is created). +domain axes. The "Cell method" column in the table gives the method +name, defined by the CF conventions, of the new cell method construct +(if one is created). ============================ ======================================== ========================== Method Description Cell method
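Taken together with the dtype assertions added in test_Data_collapse_dtype earlier in the series, the collapse data-type rules reduce to a few cases (a sketch, assuming cf is importable):

    import cf

    d = cf.Data([1, 2, 3, 4], dtype="i4")

    # Extrema (min, max, range, ...) preserve the input data type
    assert d.min().dtype == "i4"

    # Sums (sum, integral, sum_of_squares) promote to double precision
    assert d.sum().dtype == "i8"

    # Ratio-based collapses (mean, std, var, ...) are double floats
    assert d.mean().dtype == "f8"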