From f70c7280476762b4c742ad321a671106b76b9df5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 29 Mar 2022 14:03:53 +0100 Subject: [PATCH 1/4] count, count_masked --- cf/data/data.py | 116 ++++++++++++++++++++++++------------------- cf/test/test_Data.py | 38 ++++++++++---- 2 files changed, 92 insertions(+), 62 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index c93d48f7d4..3e9da9d51c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -8744,96 +8744,110 @@ def cos(self, inplace=False, i=False): return d - def count(self): + @daskified(_DASKIFIED_VERBOSE) + def count(self, axis=None, keepdims=False, split_every=None): """Count the non-masked elements of the data. .. seealso:: `count_masked` + :Parameters: + + axis: (sequence of) `int`, optional + Axis or axes along which the count is performed. The + default (`None`) performs the count over all the + dimensions of the input array. *axis* may be negative, + in which case it counts from the last to the first + axis. + + {{collapse keepdims: `bool`, optional}} + + {{split_every: `int` or `dict`, optional}} + :Returns: - ``int`` + `Data` + The count of non-missing elements. - **Examples:** + **Examples** - >>> d = cf.Data(numpy.arange(24).reshape(3, 4)) + >>> d = cf.Data(numpy.arange(12).reshape(3, 4)) >>> print(d.array) [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] >>> d.count() - 12 + + >>> d[0, :] = cf.masked >>> print(d.array) [[-- -- -- --] [ 4 5 6 7] [ 8 9 10 11]] >>> d.count() - 8 + >>> print(d.count(0).array) [2 2 2 2] >>> print(d.count(1).array) [0 4 4] - >>> print(d.count((0, 1))) + >>> print(d.count([0, 1])) 8 """ - # TODODASK - daskify, previously parallelise=mpi_on (not =False) - config = self.partition_configuration(readonly=True) + d = self.copy(array=False) + dx = self._get_dask() + dx = da.ma.count( + dx, axis=axis, keepdims=keepdims, split_every=split_every + ) + d._set_dask(dx, reset_mask_hardness=False) + d.hardmask = _DEFAULT_HARDMASK + d.override_units(_units_None, inplace=True) + return d - n = 0 + @daskified(_DASKIFIED_VERBOSE) + def count_masked(self, split_every=None): + """Count the masked elements of the data. - # self._flag_partitions_for_processing(parallelise=mpi_on) + .. seealso:: `count` - processed_partitions = [] - for pmindex, partition in self.partitions.ndenumerate(): - if partition._process_partition: - partition.open(config) - partition._pmindex = pmindex - array = partition.array - n += np.ma.count(array) - partition.close() - processed_partitions.append(partition) - # --- End: if - # --- End: for + :Parameters: - # processed_partitions contains a list of all the partitions - # that have been processed on this rank. In the serial case - # this is all of them and this line of code has no - # effect. Otherwise the processed partitions from each rank - # are distributed to every rank and processed_partitions now - # contains all the processed partitions from every rank. - processed_partitions = self._share_partitions( - processed_partitions, parallelise=False - ) + {{split_every: `int` or `dict`, optional}} - # Put the processed partitions back in the partition matrix - # according to each partitions _pmindex attribute set above. - pm = self.partitions.matrix - for partition in processed_partitions: - pm[partition._pmindex] = partition - # --- End: for + :Returns: - # Share the lock files created by each rank for each partition - # now in a temporary file so that __del__ knows which lock - # files to check if present - self._share_lock_files(parallelise=False) + `Data` + The count of missing elements. - # Aggregate the results on each process and return on all - # processes - # if mpi_on: - # n = mpi_comm.allreduce(n, op=mpi_sum) - # --- End: if + **Examples** - return n + >>> d = cf.Data(numpy.arange(12).reshape(3, 4)) + >>> print(d.array) + [[ 0 1 2 3] + [ 4 5 6 7] + [ 8 9 10 11]] + >>> d.count_masked() + - def count_masked(self): - """Count the masked elements of the data. + >>> d[0, :] = cf.masked + >>> print(d.array) + [[-- -- -- --] + [ 4 5 6 7] + [ 8 9 10 11]] + >>> d.count() + - .. seealso:: `count` + >>> print(d.count(0).array) + [1 1 1 1] + >>> print(d.count(1).array) + [4 0 0] + >>> print(d.count([0, 1])) + 4 """ - return self._size - self.count() + # TODODASK: Make a PR for da.ma.count_masked and follow the + # pattern of cf.data.count + return self.size - self.count(split_every=split_every) @daskified(_DASKIFIED_VERBOSE) def cyclic(self, axes=None, iscyclic=True): diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7e991b8f68..e6e4f1a0af 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3344,18 +3344,34 @@ def test_Data_section(self): e = cf.Data.reconstruct_sectioned_data(x) self.assertTrue(e.equals(d)) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_count(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - d = cf.Data(ma) - self.assertEqual(d.count(), 284, d.count()) - self.assertEqual(d.count_masked(), d.size - 284, d.count_masked()) - - d = cf.Data(a) - self.assertEqual(d.count(), d.size) - self.assertEqual(d.count_masked(), 0) + d = cf.Data(np.arange(12).reshape(3, 4), "m", chunks=2) + c = d.count() + self.assertEqual(c.array, 12) + self.assertEqual(c.Units, cf.Units()) + + d[0, :] = cf.masked + self.assertEqual(d.count().array, 8) + self.assertEqual(d.count([0, 1]).array, 8) + self.assertTrue((d.count(0).array == [2, 2, 2, 2]).all()) + self.assertTrue((d.count(1).array == [0, 4, 4]).all()) + + self.assertEqual(d.count(keepdims=False).shape, ()) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __sub__") + def test_Data_count_masked(self): + d = cf.Data(np.arange(12).reshape(3, 4), "m", chunks=2) + c = d.count_masked() + self.assertEqual(c.array, 0) + self.assertEqual(c.Units, cf.Units()) + + d[0, :] = cf.masked + self.assertEqual(d.count_masked().array, 4) + self.assertEqual(d.count_masked([0, 1]).array, 4) + self.assertTrue((d.count_masked(0).array == [1, 1, 1, 1]).all()) + self.assertTrue((d.count_masked(1).array == [4, 0, 0]).all()) + + self.assertEqual(d.count_masked(keepdims=False).shape, ()) def test_Data_exp(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: From 8063fa0d8e77bc902b117378d241c2a56d70bb2a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 6 Apr 2022 12:44:05 +0100 Subject: [PATCH 2/4] count, count_masked --- cf/data/data.py | 29 +++++++++++------------------ cf/functions.py | 3 +-- cf/test/test_Data.py | 10 ++-------- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 189b4619cb..2a460b5076 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -8666,7 +8666,7 @@ def cos(self, inplace=False, i=False): return d @daskified(_DASKIFIED_VERBOSE) - def count(self, axis=None, keepdims=False, split_every=None): + def count(self, axis=None, keepdims=True, split_every=None): """Count the non-masked elements of the data. .. seealso:: `count_masked` @@ -8697,7 +8697,7 @@ def count(self, axis=None, keepdims=False, split_every=None): [ 4 5 6 7] [ 8 9 10 11]] >>> d.count() - + >>> d[0, :] = cf.masked >>> print(d.array) @@ -8705,13 +8705,15 @@ def count(self, axis=None, keepdims=False, split_every=None): [ 4 5 6 7] [ 8 9 10 11]] >>> d.count() - + >>> print(d.count(0).array) - [2 2 2 2] + [[2 2 2 2]] >>> print(d.count(1).array) - [0 4 4] - >>> print(d.count([0, 1])) + [[0] + [4] + [4]] + >>> print(d.count([0, 1], keepdims=False).array) 8 """ @@ -8748,26 +8750,17 @@ def count_masked(self, split_every=None): [ 4 5 6 7] [ 8 9 10 11]] >>> d.count_masked() - + >>> d[0, :] = cf.masked >>> print(d.array) [[-- -- -- --] [ 4 5 6 7] [ 8 9 10 11]] - >>> d.count() - - - >>> print(d.count(0).array) - [1 1 1 1] - >>> print(d.count(1).array) - [4 0 0] - >>> print(d.count([0, 1])) - 4 + >>> d.count_masked() + """ - # TODODASK: Make a PR for da.ma.count_masked and follow the - # pattern of cf.data.count return self.size - self.count(split_every=split_every) @daskified(_DASKIFIED_VERBOSE) diff --git a/cf/functions.py b/cf/functions.py index ebb12c20fa..4c8355a38f 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -13,7 +13,6 @@ from collections.abc import Iterable from itertools import product from marshal import dumps -from math import ceil as math_ceil from numbers import Integral from os import getpid, listdir, mkdir from os.path import abspath as _os_path_abspath @@ -25,9 +24,9 @@ import cfdm import netCDF4 +import numpy as np from dask import config from dask.utils import parse_bytes -import numpy as np from numpy import __file__ as _numpy__file__ from numpy import __version__ as _numpy__version__ from numpy import all as _numpy_all diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 1e5268a917..435999203d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -3353,8 +3353,8 @@ def test_Data_count(self): d[0, :] = cf.masked self.assertEqual(d.count().array, 8) self.assertEqual(d.count([0, 1]).array, 8) - self.assertTrue((d.count(0).array == [2, 2, 2, 2]).all()) - self.assertTrue((d.count(1).array == [0, 4, 4]).all()) + self.assertTrue((d.count(0).array == [[2, 2, 2, 2]]).all()) + self.assertTrue((d.count(1).array == [[0], [4], [4]]).all()) self.assertEqual(d.count(keepdims=False).shape, ()) @@ -3367,11 +3367,6 @@ def test_Data_count_masked(self): d[0, :] = cf.masked self.assertEqual(d.count_masked().array, 4) - self.assertEqual(d.count_masked([0, 1]).array, 4) - self.assertTrue((d.count_masked(0).array == [1, 1, 1, 1]).all()) - self.assertTrue((d.count_masked(1).array == [4, 0, 0]).all()) - - self.assertEqual(d.count_masked(keepdims=False).shape, ()) def test_Data_exp(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -3391,7 +3386,6 @@ def test_Data_exp(self): # self.assertTrue((d.array==c).all()) so need a # check which accounts for floating point calcs: np.testing.assert_allclose(d.array, c) - # --- End: for d = cf.Data(a, "m") with self.assertRaises(Exception): From fe70352149ac03e3f242a546bcd8f17015f580dc Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 22 Jun 2022 11:56:57 +0100 Subject: [PATCH 3/4] Data.count --- cf/test/test_Data.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index ce66c83a29..4c00517bc7 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -2042,12 +2042,12 @@ def test_Data_BINARY_AND_UNARY_OPERATORS(self): ) try: - d ** x + d**x except Exception: pass else: message = "Failed in {!r}**{!r}".format(d, x) - self.assertTrue((d ** x).all(), message) + self.assertTrue((d**x).all(), message) # --- End: for for a0 in arrays: @@ -2121,12 +2121,12 @@ def test_Data_BINARY_AND_UNARY_OPERATORS(self): ) try: - x ** d + x**d except Exception: pass else: message = "Failed in {}**{!r}".format(x, d) - self.assertTrue((x ** d).all(), message) + self.assertTrue((x**d).all(), message) a = a0.copy() try: @@ -2241,12 +2241,12 @@ def test_Data_BINARY_AND_UNARY_OPERATORS(self): ) try: - d ** x + d**x except Exception: pass else: self.assertTrue( - (x ** d).all(), "{}**{}".format(x, repr(d)) + (x**d).all(), "{}**{}".format(x, repr(d)) ) self.assertTrue( @@ -2489,18 +2489,18 @@ def test_Data_section(self): self.assertEqual(key, (None, None, None)) self.assertTrue(value.equals(d)) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_count(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return + d = cf.Data(np.arange(24).reshape(2, 3, 4)) + self.assertEqual(d.count().array, 24) + for axis, c in enumerate(d.shape): + self.assertTrue((d.count(axis=axis).array == c).all()) - d = cf.Data(ma) - self.assertEqual(d.count(), 284, d.count()) - self.assertEqual(d.count_masked(), d.size - 284, d.count_masked()) + self.assertTrue((d.count(axis=[0, 1]).array == 6).all()) - d = cf.Data(a) - self.assertEqual(d.count(), d.size) - self.assertEqual(d.count_masked(), 0) + d[0, 0, 0] = np.ma.masked + self.assertEqual(d.count().array, 23) + for axis, c in enumerate(d.shape): + self.assertEqual(d.count(axis=axis).datum(0), c - 1) def test_Data_exp(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -2528,8 +2528,8 @@ def test_Data_exp(self): def test_Data_func(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - - a = np.array([[np.e, np.e ** 2, np.e ** 3.5], [0, 1, np.e ** -1]]) + + a = np.array([[np.e, np.e**2, np.e**3.5], [0, 1, np.e**-1]]) # Using sine as an example function to apply b = np.sin(a) @@ -2577,7 +2577,7 @@ def test_Data_log(self): return # Test natural log, base e - a = np.array([[np.e, np.e ** 2, np.e ** 3.5], [0, 1, np.e ** -1]]) + a = np.array([[np.e, np.e**2, np.e**3.5], [0, 1, np.e**-1]]) b = np.log(a) c = cf.Data(a, "s") d = c.log() @@ -2591,7 +2591,7 @@ def test_Data_log(self): self.assertEqual(c.shape, b.shape) # Test another base, using 10 as an example (special managed case) - a = np.array([[10, 100, 10 ** 3.5], [0, 1, 0.1]]) + a = np.array([[10, 100, 10**3.5], [0, 1, 0.1]]) b = np.log10(a) c = cf.Data(a, "s") d = c.log(base=10) @@ -2599,7 +2599,7 @@ def test_Data_log(self): self.assertEqual(d.shape, b.shape) # Test an arbitrary base, using 4 (not a special managed case like 10) - a = np.array([[4, 16, 4 ** 3.5], [0, 1, 0.25]]) + a = np.array([[4, 16, 4**3.5], [0, 1, 0.25]]) b = np.log(a) / np.log(4) # the numpy way, using log rules from school c = cf.Data(a, "s") d = c.log(base=4) @@ -3626,7 +3626,7 @@ def test_Data_collapse_units(self): self.assertEqual(func().Units, d.Units) for func in (d.sum_of_squares, d.var): - self.assertEqual(func().Units, d.Units ** 2) + self.assertEqual(func().Units, d.Units**2) for func in (d.sum_of_weights, d.sum_of_weights2): self.assertEqual(func().Units, cf.Units()) @@ -3635,7 +3635,7 @@ def test_Data_collapse_units(self): w = cf.Data(1, "m") self.assertEqual(d.integral(weights=w).Units, d.Units * w.Units) self.assertEqual(d.sum_of_weights(weights=w).Units, w.Units) - self.assertEqual(d.sum_of_weights2(weights=w).Units, w.Units ** 2) + self.assertEqual(d.sum_of_weights2(weights=w).Units, w.Units**2) # Dimensionless data d = cf.Data([1, 2]) From 5a11cb491f8f22ddb857cbe1e7a479ed6e9b7e49 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 22 Jun 2022 12:02:34 +0100 Subject: [PATCH 4/4] Data.count_masked --- cf/test/test_Data.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 5bf461eaf1..5d3050786d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -2502,10 +2502,14 @@ def test_Data_count(self): for axis, c in enumerate(d.shape): self.assertEqual(d.count(axis=axis).datum(0), c - 1) - def test_Data_exp(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return + def test_Data_count_masked(self): + d = cf.Data(np.arange(24).reshape(2, 3, 4)) + self.assertEqual(d.count_masked().array, 0) + + d[0, 0, 0] = np.ma.masked + self.assertEqual(d.count_masked().array, 1) + def test_Data_exp(self): for x in (1, -1): a = 0.9 * x * self.ma c = np.ma.exp(a) @@ -2526,9 +2530,6 @@ def test_Data_exp(self): _ = d.exp() def test_Data_func(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - a = np.array([[np.e, np.e**2, np.e**3.5], [0, 1, np.e**-1]]) # Using sine as an example function to apply