Integrate PalsMpiexecSettings into Experiment factory methods (#343)

Registers the `PalsMpiexecSettings` class in SmartSim's `Experiment` factory methods so that the correct `_BaseMpiSettings` class is returned depending on the launcher the user intends to use. [ committed by @MattToast ] [ reviewed by @al-rigazzi @ashao @ankona ]
CrayLabs · Aug 25, 2023 · f0d510d · f0d510d
1 parent b0b4acb
commit f0d510d
Show file tree

Hide file tree

Showing 11 changed files with 286 additions and 110 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,12 @@ _build
 
 smartredis
 
+# Envs
+venv/
+.venv/
+env/
+.env/
+
 # written upon install
 smartsim/version.py
 

diff --git a/conftest.py b/conftest.py
@@ -89,7 +89,7 @@ def print_test_configuration() -> None:
 
 def pytest_configure() -> None:
     pytest.test_launcher = test_launcher
-    pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf"]
+    pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf", "pals"]
     account = get_account()
     pytest.test_account = account
 
@@ -141,23 +141,13 @@ def get_hostlist() -> t.Optional[t.List[str]]:
     if not test_hostlist:
         if "COBALT_NODEFILE" in os.environ:
             try:
-                cobalt_fp = os.environ["COBALT_NODEFILE"]
-                with open(cobalt_fp, "r", encoding="utf-8") as nodefile:
-                    lines = nodefile.readlines()
-                    test_hostlist = list(
-                        dict.fromkeys([line.strip() for line in lines])
-                    )
-            except Exception:
+                return _parse_hostlist_file(os.environ["COBALT_NODEFILE"])
+            except FileNotFoundError:
                 return None
         elif "PBS_NODEFILE" in os.environ and not shutil.which("aprun"):
             try:
-                pbs_fp = os.environ["PBS_NODEFILE"]
-                with open(pbs_fp, "r", encoding="utf-8") as nodefile:
-                    lines = nodefile.readlines()
-                    test_hostlist = list(
-                        dict.fromkeys([line.strip() for line in lines])
-                    )
-            except Exception:
+                return _parse_hostlist_file(os.environ["PBS_NODEFILE"])
+            except FileNotFoundError:
                 return None
         elif "SLURM_JOB_NODELIST" in os.environ:
             try:
@@ -173,6 +163,11 @@ def get_hostlist() -> t.Optional[t.List[str]]:
     return test_hostlist
 
 
+def _parse_hostlist_file(path: str) -> t.List[str]:
+    """Return the unique host names listed in a node file, in file order.
+
+    Each line of the file is stripped of surrounding whitespace and treated
+    as one host name. Repeated names are collapsed to their first occurrence
+    and blank lines are skipped.
+
+    :param path: path to the node file to read
+    :type path: str
+    :return: de-duplicated host names in the order they first appear
+    :rtype: list[str]
+    :raises FileNotFoundError: if ``path`` does not exist
+    """
+    with open(path, "r", encoding="utf-8") as nodefile:
+        hosts = (line.strip() for line in nodefile)
+        # ``dict.fromkeys`` de-duplicates while keeping first-seen order; a
+        # plain ``set`` would return the hosts in an arbitrary order that can
+        # differ between runs.
+        return list(dict.fromkeys(host for host in hosts if host))
+
+
 @pytest.fixture(scope="session")
 def alloc_specs() -> t.Dict[str, t.Any]:
     specs: t.Dict[str, t.Any] = {}
@@ -245,6 +240,11 @@ def get_base_run_settings(
                 exe, args, run_command=run_command, run_args=run_args
             )
             return settings
+        if test_launcher == "pals":
+            host_file = os.environ["PBS_NODEFILE"]
+            run_args = {"--np": ntasks, "--hostfile": host_file}
+            run_args.update(kwargs)
+            return RunSettings(exe, args, run_command="mpiexec", run_args=run_args)
         if test_launcher == "cobalt":
             if shutil.which("aprun"):
                 run_command = "aprun"
@@ -291,7 +291,11 @@ def get_run_settings(
             run_args = {"n": ntasks, "hostfile": host_file}
             run_args.update(kwargs)
             return MpirunSettings(exe, args, run_args=run_args)
-
+        if test_launcher == "pals":
+            host_file = os.environ["PBS_NODEFILE"]
+            run_args = {"np": ntasks, "hostfile": host_file}
+            run_args.update(kwargs)
+            return PalsMpiexecSettings(exe, args, run_args=run_args)
         # TODO allow user to pick aprun vs MPIrun
         if test_launcher == "cobalt":
             if shutil.which("aprun"):
@@ -316,7 +320,7 @@ def get_run_settings(
 
     @staticmethod
     def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator:
-        if test_launcher in ["pbs", "cobalt"]:
+        if test_launcher in ["pbs", "cobalt", "pals"]:
             if not shutil.which("aprun"):
                 hostlist = get_hostlist()
             else:

diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py
@@ -258,6 +258,7 @@ def init_launcher(self, launcher: str) -> None:
         launcher_map: t.Dict[str, t.Type[Launcher]] = {
             "slurm": SlurmLauncher,
             "pbs": PBSLauncher,
+            "pals": PBSLauncher,
             "cobalt": CobaltLauncher,
             "lsf": LSFLauncher,
             "local": LocalLauncher,
@@ -387,7 +388,7 @@ def _launch_orchestrator(self, orchestrator: Orchestrator) -> None:
     def _launch_step(
         self, job_step: Step, entity: t.Union[SmartSimEntity, EntityList]
     ) -> None:
-        """Use the launcher to launch a job stop
+        """Use the launcher to launch a job step
 
         :param job_step: a job step instance
         :type job_step: Step

diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py
@@ -105,14 +105,13 @@ def create_step(
         :rtype: Step
         """
         try:
-            settings_class = step_settings.__class__
-            if settings_class in self.supported_rs:
-                step_class = self.supported_rs[settings_class]
-                step = step_class(name, cwd, step_settings)
-                return step
+            step_class = self.supported_rs[type(step_settings)]
+        except KeyError:
             raise SSUnsupportedError(
                 f"RunSettings type {type(step_settings)} not supported by this launcher"
-            )
+            ) from None
+        try:
+            return step_class(name, cwd, step_settings)
         except AllocationError as e:
             raise LauncherError("Step creation failed") from e
 
@@ -124,12 +123,6 @@ def get_step_nodes(
     ) -> t.List[t.List[str]]:  # pragma: no cover
         raise SSUnsupportedError("Node acquisition not supported for this launcher")
 
-    def run(self, step: Step) -> t.Optional[str]:  # pragma: no cover
-        raise NotImplementedError
-
-    def stop(self, step_name: str) -> StepInfo:  # pragma: no cover
-        raise NotImplementedError
-
     def get_step_update(
         self, step_names: t.List[str]
     ) -> t.List[t.Tuple[str, t.Union[StepInfo, None]]]:  # cov-wlm

diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py
@@ -64,6 +64,7 @@
 by_launcher: t.Dict[str, t.List[str]] = {
     "slurm": ["srun", "mpirun", "mpiexec"],
     "pbs": ["aprun", "mpirun", "mpiexec"],
+    "pals": ["mpiexec"],
     "cobalt": ["aprun", "mpirun", "mpiexec"],
     "lsf": ["jsrun"],
     "local": [""],

diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py
@@ -58,7 +58,6 @@ def __init__(
         self,
         exe: str,
         exe_args: t.Optional[t.Union[str, t.List[str]]] = None,
-        run_command: str = "mpiexec",
         run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None,
         env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None,
         fail_if_missing_exec: bool = True,
@@ -89,7 +88,7 @@ def __init__(
         super().__init__(
             exe,
             exe_args,
-            run_command=run_command,
+            run_command="mpiexec",
             run_args=run_args,
             env_vars=env_vars,
             fail_if_missing_exec=fail_if_missing_exec,

diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py
@@ -43,8 +43,11 @@
     MpiexecSettings,
     OrterunSettings,
     JsrunSettings,
+    PalsMpiexecSettings,
 )
 
+_TRunSettingsSelector = t.Callable[[str], t.Callable[..., RunSettings]]
+
 
 def create_batch_settings(
     launcher: str,
@@ -144,20 +147,23 @@ def create_run_settings(
     :raises SmartSimError: if run_command=="auto" and detection fails
     """
     # all supported RunSettings child classes
-    supported: t.Dict[str, t.Callable[..., RunSettings]] = {
-        "aprun": AprunSettings,
-        "srun": SrunSettings,
-        "mpirun": MpirunSettings,
-        "mpiexec": MpiexecSettings,
-        "orterun": OrterunSettings,
-        "jsrun": JsrunSettings,
+    supported: t.Dict[str, _TRunSettingsSelector] = {
+        "aprun": lambda launcher: AprunSettings,
+        "srun": lambda launcher: SrunSettings,
+        "mpirun": lambda launcher: MpirunSettings,
+        "mpiexec": lambda launcher: (
+            MpiexecSettings if launcher != "pals" else PalsMpiexecSettings
+        ),
+        "orterun": lambda launcher: OrterunSettings,
+        "jsrun": lambda launcher: JsrunSettings,
     }
 
     # run commands supported by each launcher
     # in order of suspected user preference
     by_launcher = {
         "slurm": ["srun", "mpirun", "mpiexec"],
         "pbs": ["aprun", "mpirun", "mpiexec"],
+        "pals": ["mpiexec"],
         "cobalt": ["aprun", "mpirun", "mpiexec"],
         "lsf": ["jsrun", "mpirun", "mpiexec"],
         "local": [""],
@@ -192,7 +198,7 @@ def _detect_command(launcher: str) -> str:
 
     # if user specified and supported or auto detection worked
     if run_command and run_command in supported:
-        return supported[run_command](
+        return supported[run_command](launcher)(
             exe, exe_args, run_args, env_vars, container=container, **kwargs
         )
 

diff --git a/tests/test_configs/mpi_impl_stubs/pals/mpiexec b/tests/test_configs/mpi_impl_stubs/pals/mpiexec
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2023, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Stub executable to print out a generic PALS ``mpiexec --version`` message
+
+echo "mpiexec version 1.2.12 revision d3dd612f9372 built Apr 12 2023"
diff --git a/tests/test_experiment.py b/tests/test_experiment.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import contextlib
 
 import pytest
 
@@ -154,15 +155,12 @@ def test_summary(fileutils):
     assert 0 == int(row["RunID"])
     assert 0 == int(row["Returncode"])
 
+def test_launcher_detection(wlmutils, monkeypatch):
+    if wlmutils.get_test_launcher() == "pals":
+        pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals")
+    if wlmutils.get_test_launcher() == "local":
+        monkeypatch.setenv("PATH", "")  # Remove all WLMs from PATH
 
-def test_launcher_detection(wlmutils):
     exp = Experiment("test-launcher-detection", launcher="auto")
 
-    # We check whether the right launcher is found. But if
-    # the test launcher was set to local, we tolerate finding
-    # another one (this cannot be avoided)
-    if (
-        exp._launcher != wlmutils.get_test_launcher()
-        and wlmutils.get_test_launcher() != "local"
-    ):
-        assert False
+    assert exp._launcher == wlmutils.get_test_launcher()
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,6 +17,12 @@ _build @@
     smartredis
+    # Envs
+    venv/
+    .venv/
+    env/
+    .env/
     # written upon install
     smartsim/version.py
@@ Expand Down @@