From e18182f2427e4aeff3d8d93c75cb276fec56c3e1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 2 Feb 2022 12:02:46 +0000 Subject: [PATCH 1/5] daskify cf.Data.digitize --- cf/data/data.py | 87 ++++++++++++++++++-------------------------- cf/test/test_Data.py | 36 +++++++++++++----- 2 files changed, 62 insertions(+), 61 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index a5cf4bf9fc..a5baa23711 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1656,6 +1656,7 @@ def dumps(self): return json_dumps(d, default=convert_to_builtin_type) + @_inplace_enabled(default=False) def digitize( self, bins, @@ -1663,6 +1664,7 @@ def digitize( open_ends=False, closed_ends=None, return_bins=False, + inplace=False, ): """Return the indices of the bins to which each value belongs. @@ -1747,6 +1749,8 @@ def digitize( return_bins: `bool`, optional If True then also return the bins in their 2-d form. + {{inplace: `bool`, optional}} + :Returns: `Data`, [`Data`] @@ -1755,7 +1759,7 @@ def digitize( If *return_bins* is True then also return the bins in their 2-d form. - **Examples:** + **Examples** >>> d = cf.Data(numpy.arange(12).reshape(3, 4)) [[ 0 1 2 3] @@ -1811,9 +1815,11 @@ def digitize( [ 1 1 1 --]] """ - out = self.copy() + from dask.array import digitize + + d = _inplace_enabled_define_and_cleanup(self) - org_units = self.Units + org_units = d.Units bin_units = getattr(bins, "Units", None) @@ -1830,12 +1836,16 @@ def digitize( else: bin_units = org_units - bins = np.asanyarray(bins) + # Get bins as a numpy array + if isinstance(bins, np.ndarray): + bins = bins.copy() + else: + bins = np.asanyarray(bins) if bins.ndim > 2: raise ValueError( "The 'bins' parameter must be scalar, 1-d or 2-d" - "Got: {!r}".format(bins) + f"Got: {bins!r}" ) two_d_bins = None @@ -1848,7 +1858,7 @@ def digitize( if bins.shape[1] != 2: raise ValueError( "The second dimension of the 'bins' parameter must " - "have size 2. Got: {!r}".format(bins) + f"have size 2. Got: {bins!r}" ) bins.sort(axis=1) @@ -1858,11 +1868,9 @@ def digitize( for i, (u, l) in enumerate(zip(bins[:-1, 1], bins[1:, 0])): if u > l: raise ValueError( - "Overlapping bins: {}, {}".format( - tuple(bins[i]), tuple(bins[i + i]) - ) + f"Overlapping bins: " + f"{tuple(bins[i])}, {tuple(bins[i + i])}" ) - # --- End: for two_d_bins = bins bins = np.unique(bins) @@ -1900,8 +1908,8 @@ def digitize( "scalar." ) - mx = self.max().datum() - mn = self.min().datum() + mx = d.max().datum() + mn = d.min().datum() bins = np.linspace(mn, mx, int(bins) + 1, dtype=float) delete_bins = [] @@ -1913,7 +1921,8 @@ def digitize( "Can't set open_ends=True when closed_ends is True." ) - bins = bins.astype(float, copy=True) + if bins.dtype.kind != "f": + bins = bins.astype(float, copy=False) epsilon = np.finfo(float).eps ndim = bins.ndim @@ -1923,53 +1932,27 @@ def digitize( else: mx = bins[(-1,) * ndim] bins[(-1,) * ndim] += abs(mx) * epsilon - # --- End: if if not open_ends: delete_bins.insert(0, 0) delete_bins.append(bins.size) - if return_bins and two_d_bins is None: - x = np.empty((bins.size - 1, 2), dtype=bins.dtype) - x[:, 0] = bins[:-1] - x[:, 1] = bins[1:] - two_d_bins = x - - config = out.partition_configuration(readonly=True) - - for partition in out.partitions.matrix.flat: - partition.open(config) - array = partition.array - - mask = None - if np.ma.isMA(array): - mask = array.mask.copy() - - array = np.digitize(array, bins, right=upper) - - if delete_bins: - for n, d in enumerate(delete_bins): - d -= n - array = np.ma.where(array == d, np.ma.masked, array) - array = np.ma.where(array > d, array - 1, array) - # --- End: if - - if mask is not None: - array = np.ma.where(mask, np.ma.masked, array) - - partition.subarray = array - partition.Units = _units_None - - partition.close() - - out.dtype = int - - out.override_units(_units_None, inplace=True) + # Digitise the array + dx = d._get_dask() + dx = digitize(dx, bins, right=upper) + d._set_dask(dx, reset_mask_hardness=True) + d.override_units(_units_None, inplace=True) if return_bins: - return out, type(self)(two_d_bins, units=bin_units) + if two_d_bins is None: + two_d_bins = np.empty((bins.size - 1, 2), dtype=bins.dtype) + two_d_bins[:, 0] = bins[:-1] + two_d_bins[:, 1] = bins[1:] - return out + two_d_bins = type(self)(two_d_bins, units=bin_units) + return d, two_d_bins + + return d def median( self, diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 66421119af..36e1af09f4 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -804,7 +804,6 @@ def test_Data__init__dtype_mask(self): self.assertTrue((d.array == a).all()) self.assertTrue((d.mask.array == np.ma.getmaskarray(a)).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_digitize(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -830,14 +829,33 @@ def test_Data_digitize(self): self.assertTrue((e.array == b).all()) - e.where( - cf.set([e.minimum(), e.maximum()]), - cf.masked, - e - 1, - inplace=True, - ) - f = d.digitize(bins, upper=upper) - self.assertTrue(e.equals(f, verbose=2)) + # TODODASK: Reinstate the following test when + # __sub__, minimum, and maximum have + # been daskified + + # e.where( + # cf.set([e.minimum(), e.maximum()]), + # cf.masked, + # e - 1, + # inplace=True, + # ) + # f = d.digitize(bins, upper=upper) + # self.assertTrue(e.equals(f, verbose=2)) + + # Check returned bins + bins = [2, 6, 10, 50, 100] + e, b = d.digitize(bins, return_bins=True) + self.assertTrue( + (b.array == [[2, 6], [6, 10], [10, 50], [50, 100]]).all() + ) + self.assertTrue(b.Units == d.Units) + + # Check digitized units + self.assertTrue(e.Units == cf.Units(None)) + + # Check inplace + self.assertIsNone(d.digitize(bins, inplace=True)) + self.assertTrue(d.equals(e)) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_cumsum(self): From 84feaddf4596795695517a26630b4f0a703192ed Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 4 Feb 2022 08:32:26 +0000 Subject: [PATCH 2/5] add mask Data.digitize mask test --- cf/test/test_Data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 36e1af09f4..7425e7a460 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -828,6 +828,9 @@ def test_Data_digitize(self): b = np.digitize(a, [2, 6, 10, 50, 100], right=upper) self.assertTrue((e.array == b).all()) + self.assertTrue( + (np.ma.getmask(e.array) == np.ma.getmask(b)).all() + ) # TODODASK: Reinstate the following test when # __sub__, minimum, and maximum have From fac074dabb44633c9c89183b718f8e0a54d4c939 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 4 Feb 2022 17:36:19 +0000 Subject: [PATCH 3/5] add decorator Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cf/data/data.py b/cf/data/data.py index a5baa23711..da8c602021 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1656,6 +1656,7 @@ def dumps(self): return json_dumps(d, default=convert_to_builtin_type) + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def digitize( self, From f627f217dd98073eb654c283b15c3f460c82880f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 4 Feb 2022 17:38:45 +0000 Subject: [PATCH 4/5] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index da8c602021..27a41ad538 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1846,7 +1846,7 @@ def digitize( if bins.ndim > 2: raise ValueError( "The 'bins' parameter must be scalar, 1-d or 2-d" - f"Got: {bins!r}" + f" Got: {bins!r}" ) two_d_bins = None From a66922121f12e7897e01bbe8e0d3f9eb4e806fc3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 4 Feb 2022 17:40:41 +0000 Subject: [PATCH 5/5] remove unnecessary import --- cf/data/data.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 27a41ad538..9819ab0f68 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1816,8 +1816,6 @@ def digitize( [ 1 1 1 --]] """ - from dask.array import digitize - d = _inplace_enabled_define_and_cleanup(self) org_units = d.Units @@ -1845,8 +1843,8 @@ def digitize( if bins.ndim > 2: raise ValueError( - "The 'bins' parameter must be scalar, 1-d or 2-d" - f" Got: {bins!r}" + "The 'bins' parameter must be scalar, 1-d or 2-d. " + f"Got: {bins!r}" ) two_d_bins = None @@ -1940,7 +1938,7 @@ def digitize( # Digitise the array dx = d._get_dask() - dx = digitize(dx, bins, right=upper) + dx = da.digitize(dx, bins, right=upper) d._set_dask(dx, reset_mask_hardness=True) d.override_units(_units_None, inplace=True)