mpi: Simplify Custom domain decomposition

georgebisbas · georgebisbas · commit 9c7e8d3076c0 · 2023-06-09T11:43:44.000+01:00
diff --git a/devito/mpi/distributed.py b/devito/mpi/distributed.py
@@ -567,31 +567,29 @@ def _arg_values(self, *args, **kwargs):
 class CustomTopology(tuple):
 
     """
-    A CustomTopology is a mechanism to describe parametric domain decompositions.
+    The CustomTopology class provides a mechanism to describe parametric domain
+    decompositions. It allows users to specify how the dimensions of a domain are
+    decomposed into chunks based on certain parameters.
 
     Examples
     --------
-    Assuming a domain consisting of three distributed Dimensions x, y, and z, and
-    an MPI communicator comprising N processes, a CustomTopology might be:
+    For example, let's consider a domain with three distributed dimensions: x, y, and z,
+    and an MPI communicator with N processes. Here are a few examples of CustomTopology:
 
     With N known, say N=4:
-
     * `(1, 1, 4)`: the z Dimension is decomposed into 4 chunks
-    * `(2, 1, 2)`: the x Dimension is decomposed into 2 chunks; the z Dimension
+    * `(2, 1, 2)`: the x Dimension is decomposed into 2 chunks and the z Dimension
                    is decomposed into 2 chunks
 
     With N unknown:
-
-    * `(1, '*', 1)`: the wildcard `'*'` tells the runtime to decompose the y
+    * `(1, '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose the y
                      Dimension into N chunks
-    * `('*', '*', 1)`: the wildcard `'*'` tells the runtime to decompose both
+    * `('*', '*', 1)`: the wildcard `'*'` indicates that the runtime should decompose both
                        the x and y Dimensions in `nstars` factors of N, prioritizing
                        the outermost dimension
 
-    Assuming N=6 and requested topology is `('*', '*', 1)`,
-    since there is no integer k, so that k*k=6, we resort to the closest factors to
-    the nstars-th root (usually square or cubic) that satisfies that the decomposed
-    domains are equal to the number of MPI processes.
+    Assuming that the number of ranks `N` cannot evenly be decomposed to the requested
+    stars=6 we decompose as evenly as possible by prioritising the outermost dimension
 
     For N=3
     * `('*', '*', 1)` gives: (3, 1, 1)
@@ -611,58 +609,48 @@ class CustomTopology(tuple):
 
     Notes
     -----
-    Users shouldn't use this class directly. It's up to the Devito runtime to
-    instantiate it based on the user input.
+    Users should not directly use the CustomTopology class. It is instantiated
+    by the Devito runtime based on user input.
     """
 
     def __new__(cls, items, input_comm):
         # Keep track of nstars and already defined decompositions
-        nstars = len([i for i in items if i == '*'])
+        nstars = items.count('*')
 
         # If no stars exist we are ready
         if nstars == 0:
             processed = items
         else:
-            # Init decomposition list
+            # Init decomposition list and track star positions
             processed = [1] * len(items)
-
-            # Get star and integer indices
-            int_pos = [i for i, item in enumerate(items) if isinstance(item, int)]
-            int_vals = [item for item in items if isinstance(item, int)]
-            star_pos = [i for i, item in enumerate(items) if not isinstance(item, int)]
-
-            # Decompose the processes remaining for allocation to prime factors
+            star_pos = []
+            for i, item in enumerate(items):
+                if isinstance(item, int):
+                    processed[i] = item
+                else:
+                    star_pos.append(i)
+
+            # Compute the remaining procs to be allocated
             alloc_procs = np.prod([i for i in items if i != '*'])
-            remprocs = int(input_comm.size // alloc_procs)
-            prime_factors = primefactors(remprocs)
-
-            star_i = -1
-            dd_list = [1] * nstars
+            rem_procs = int(input_comm.size // alloc_procs)
 
             # Start by using the max prime factor at the first starred position,
-            # then cyclically-iteratively decompose as evenly as possible until
-            # decomposing to the number of `remprocs`
-            while remprocs != 1:
-                star_i = star_i + 1
-                star_i = star_i % nstars
-                prime_factors = primefactors(remprocs)
-                dd_list[star_i] = dd_list[star_i]*max(prime_factors)
-                remprocs = remprocs // max(prime_factors)
-
-            if int_pos:
-                for index, value in zip(int_pos, int_vals):
-                    processed[index] = value
-
-            if dd_list:
-                for index, value in zip(star_pos, dd_list):
-                    processed[index] = value
+            # then iteratively decompose as evenly as possible until decomposing
+            # to the number of `rem_procs`
+            star_vals = [1] * len(items)
+            star_i = 0
+            while rem_procs > 1:
+                prime_factors = primefactors(rem_procs)
+                rem_procs //= max(prime_factors)
+                star_vals[star_i] *= max(prime_factors)
+                star_i = (star_i + 1) % nstars
+
+            # Apply computed star values to the processed
+            for index, value in zip(star_pos, star_vals):
+                processed[index] = value
 
         # Final check that topology matches the communicator size
-        try:
-            assert np.prod(processed) == input_comm.size
-        except:
-            raise ValueError("Invalid `topology`", processed, " for given nprocs:",
-                             input_comm.size)
+        assert np.prod(processed) == input_comm.size
 
         obj = super().__new__(cls, processed)
         obj.logical = items
diff --git a/tests/test_mpi.py b/tests/test_mpi.py
@@ -12,17 +12,13 @@
                            retrieve_iteration_tree)
 from devito.mpi import MPI
 from devito.mpi.routines import HaloUpdateCall, HaloUpdateList, MPICall
+from devito.mpi.distributed import CustomTopology
+from devito.tools import Bunch
 from examples.seismic.acoustic import acoustic_setup
 
 pytestmark = skipif(['nompi'], whole_module=True)
 
 
-class DummyInputComm():
-    "Helper class for modelling a communicator with a specific size"
-    def __init__(self, size):
-        self.size = size
-
-
 class TestDistributor(object):
 
     @pytest.mark.parallel(mode=[2, 4])
@@ -193,12 +189,9 @@ def test_custom_topology(self):
         (2, (1, '*', '*'), (1, 2, 1)),
         (2, (2, '*', '*'), (2, 1, 1)),
         (3, (1, '*', '*'), (1, 3, 1)),
-        (3, ('*', 1, '*'), (3, 1, 1)),
         (3, ('*', '*', 1), (3, 1, 1)),
         (4, (2, '*', '*'), (2, 2, 1)),
-        (4, ('*', 2, '*'), (2, 2, 1)),
         (4, ('*', '*', 2), (2, 1, 2)),
-        (6, ('*', 1, '*'), (3, 1, 2)),
         (6, ('*', '*', 1), (3, 2, 1)),
         (6, (1, '*', '*'), (1, 3, 2)),
         (6, ('*', '*', '*'), (3, 2, 1)),
@@ -211,29 +204,23 @@ def test_custom_topology(self):
         (32, ('*', '*', '*'), (4, 4, 2)),
         (8, ('*', 1, '*'), (4, 1, 2)),
         (8, ('*', '*', 1), (4, 2, 1)),
-        (8, (1, '*', '*'), (1, 4, 2)),
         (8, ('*', '*', '*'), (2, 2, 2)),
         (9, ('*', '*', '*'), (3, 3, 1)),
         (11, (1, '*', '*'), (1, 11, 1)),
         (22, ('*', '*', '*'), (11, 2, 1)),
-        (16, ('*', '*', 1), (4, 4, 1)),
         (16, ('*', 1, '*'), (4, 1, 4)),
         (32, ('*', '*', 1), (8, 4, 1)),
-        (64, ('*', '*', '*'), (4, 4, 4)),
         (64, ('*', '*', 1), (8, 8, 1)),
-        (64, ('*', 2, 1), (32, 2, 1)),
         (64, ('*', 2, 4), (8, 2, 4)),
         (128, ('*', '*', 1), (16, 8, 1)),
         (231, ('*', '*', '*'), (11, 7, 3)),
         (256, (1, '*', '*'), (1, 16, 16)),
-        (256, ('*', 1, '*'), (16, 1, 16)),
-        (256, ('*', '*', 1), (16, 16, 1)),
         (256, ('*', '*', '*'), (8, 8, 4)),
         (256, ('*', '*', 2), (16, 8, 2)),
         (256, ('*', 32, 2), (4, 32, 2)),
     ])
     def test_custom_topology_3d_dummy(self, comm_size, topology, dist_topology):
-        dummy_comm = DummyInputComm(comm_size)
+        dummy_comm = Bunch(size=comm_size)
         custom_topology = CustomTopology(topology, dummy_comm)
         assert custom_topology == dist_topology