Skip to content

Commit

Permalink
Integrate PalsMpiexecSettings into Experiment factory methods (#343)
Browse files Browse the repository at this point in the history
Registers the `PalsMpiexecSettings` class in SmartSim's `Experiment`
factory methods so that the correct `_BaseMpiSettings` class is returned
depending on the launcher the user is intending to use.

[ committed by @MattToast ]
[ reviewed by @al-rigazzi @ashao @ankona ]
  • Loading branch information
MattToast authored Aug 25, 2023
1 parent b0b4acb commit f0d510d
Show file tree
Hide file tree
Showing 11 changed files with 286 additions and 110 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ _build

smartredis

# Envs
venv/
.venv/
env/
.env/

# written upon install
smartsim/version.py

Expand Down
38 changes: 21 additions & 17 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def print_test_configuration() -> None:

def pytest_configure() -> None:
pytest.test_launcher = test_launcher
pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf"]
pytest.wlm_options = ["slurm", "pbs", "cobalt", "lsf", "pals"]
account = get_account()
pytest.test_account = account

Expand Down Expand Up @@ -141,23 +141,13 @@ def get_hostlist() -> t.Optional[t.List[str]]:
if not test_hostlist:
if "COBALT_NODEFILE" in os.environ:
try:
cobalt_fp = os.environ["COBALT_NODEFILE"]
with open(cobalt_fp, "r", encoding="utf-8") as nodefile:
lines = nodefile.readlines()
test_hostlist = list(
dict.fromkeys([line.strip() for line in lines])
)
except Exception:
return _parse_hostlist_file(os.environ["COBALT_NODEFILE"])
except FileNotFoundError:
return None
elif "PBS_NODEFILE" in os.environ and not shutil.which("aprun"):
try:
pbs_fp = os.environ["PBS_NODEFILE"]
with open(pbs_fp, "r", encoding="utf-8") as nodefile:
lines = nodefile.readlines()
test_hostlist = list(
dict.fromkeys([line.strip() for line in lines])
)
except Exception:
return _parse_hostlist_file(os.environ["PBS_NODEFILE"])
except FileNotFoundError:
return None
elif "SLURM_JOB_NODELIST" in os.environ:
try:
Expand All @@ -173,6 +163,11 @@ def get_hostlist() -> t.Optional[t.List[str]]:
return test_hostlist


def _parse_hostlist_file(path: str) -> t.List[str]:
    """Read a node file and return its unique host names in file order.

    Each line of the file is stripped of surrounding whitespace and
    treated as one host name. Repeated host names are collapsed to a
    single entry, and blank lines are ignored.

    :param path: path to the node file to read
    :type path: str
    :return: unique host names, ordered by first appearance in the file
    :rtype: list[str]
    :raises FileNotFoundError: if ``path`` does not exist
    """
    with open(path, "r", encoding="utf-8") as nodefile:
        stripped = (line.strip() for line in nodefile)
        # ``dict.fromkeys`` de-duplicates while keeping first-seen order; a
        # ``set`` would hand the hosts back in arbitrary order, which can
        # differ from one interpreter run to the next.
        return list(dict.fromkeys(host for host in stripped if host))


@pytest.fixture(scope="session")
def alloc_specs() -> t.Dict[str, t.Any]:
specs: t.Dict[str, t.Any] = {}
Expand Down Expand Up @@ -245,6 +240,11 @@ def get_base_run_settings(
exe, args, run_command=run_command, run_args=run_args
)
return settings
if test_launcher == "pals":
host_file = os.environ["PBS_NODEFILE"]
run_args = {"--np": ntasks, "--hostfile": host_file}
run_args.update(kwargs)
return RunSettings(exe, args, run_command="mpiexec", run_args=run_args)
if test_launcher == "cobalt":
if shutil.which("aprun"):
run_command = "aprun"
Expand Down Expand Up @@ -291,7 +291,11 @@ def get_run_settings(
run_args = {"n": ntasks, "hostfile": host_file}
run_args.update(kwargs)
return MpirunSettings(exe, args, run_args=run_args)

if test_launcher == "pals":
host_file = os.environ["PBS_NODEFILE"]
run_args = {"np": ntasks, "hostfile": host_file}
run_args.update(kwargs)
return PalsMpiexecSettings(exe, args, run_args=run_args)
# TODO allow user to pick aprun vs MPIrun
if test_launcher == "cobalt":
if shutil.which("aprun"):
Expand All @@ -316,7 +320,7 @@ def get_run_settings(

@staticmethod
def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator:
if test_launcher in ["pbs", "cobalt"]:
if test_launcher in ["pbs", "cobalt", "pals"]:
if not shutil.which("aprun"):
hostlist = get_hostlist()
else:
Expand Down
3 changes: 2 additions & 1 deletion smartsim/_core/control/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ def init_launcher(self, launcher: str) -> None:
launcher_map: t.Dict[str, t.Type[Launcher]] = {
"slurm": SlurmLauncher,
"pbs": PBSLauncher,
"pals": PBSLauncher,
"cobalt": CobaltLauncher,
"lsf": LSFLauncher,
"local": LocalLauncher,
Expand Down Expand Up @@ -387,7 +388,7 @@ def _launch_orchestrator(self, orchestrator: Orchestrator) -> None:
def _launch_step(
self, job_step: Step, entity: t.Union[SmartSimEntity, EntityList]
) -> None:
"""Use the launcher to launch a job stop
"""Use the launcher to launch a job step
:param job_step: a job step instance
:type job_step: Step
Expand Down
17 changes: 5 additions & 12 deletions smartsim/_core/launcher/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,13 @@ def create_step(
:rtype: Step
"""
try:
settings_class = step_settings.__class__
if settings_class in self.supported_rs:
step_class = self.supported_rs[settings_class]
step = step_class(name, cwd, step_settings)
return step
step_class = self.supported_rs[type(step_settings)]
except KeyError:
raise SSUnsupportedError(
f"RunSettings type {type(step_settings)} not supported by this launcher"
)
) from None
try:
return step_class(name, cwd, step_settings)
except AllocationError as e:
raise LauncherError("Step creation failed") from e

Expand All @@ -124,12 +123,6 @@ def get_step_nodes(
) -> t.List[t.List[str]]: # pragma: no cover
raise SSUnsupportedError("Node acquisition not supported for this launcher")

def run(self, step: Step) -> t.Optional[str]: # pragma: no cover
raise NotImplementedError

def stop(self, step_name: str) -> StepInfo: # pragma: no cover
raise NotImplementedError

def get_step_update(
self, step_names: t.List[str]
) -> t.List[t.Tuple[str, t.Union[StepInfo, None]]]: # cov-wlm
Expand Down
1 change: 1 addition & 0 deletions smartsim/database/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
by_launcher: t.Dict[str, t.List[str]] = {
"slurm": ["srun", "mpirun", "mpiexec"],
"pbs": ["aprun", "mpirun", "mpiexec"],
"pals": ["mpiexec"],
"cobalt": ["aprun", "mpirun", "mpiexec"],
"lsf": ["jsrun"],
"local": [""],
Expand Down
3 changes: 1 addition & 2 deletions smartsim/settings/palsSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def __init__(
self,
exe: str,
exe_args: t.Optional[t.Union[str, t.List[str]]] = None,
run_command: str = "mpiexec",
run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None,
env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None,
fail_if_missing_exec: bool = True,
Expand Down Expand Up @@ -89,7 +88,7 @@ def __init__(
super().__init__(
exe,
exe_args,
run_command=run_command,
run_command="mpiexec",
run_args=run_args,
env_vars=env_vars,
fail_if_missing_exec=fail_if_missing_exec,
Expand Down
22 changes: 14 additions & 8 deletions smartsim/settings/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@
MpiexecSettings,
OrterunSettings,
JsrunSettings,
PalsMpiexecSettings,
)

_TRunSettingsSelector = t.Callable[[str], t.Callable[..., RunSettings]]


def create_batch_settings(
launcher: str,
Expand Down Expand Up @@ -144,20 +147,23 @@ def create_run_settings(
:raises SmartSimError: if run_command=="auto" and detection fails
"""
# all supported RunSettings child classes
supported: t.Dict[str, t.Callable[..., RunSettings]] = {
"aprun": AprunSettings,
"srun": SrunSettings,
"mpirun": MpirunSettings,
"mpiexec": MpiexecSettings,
"orterun": OrterunSettings,
"jsrun": JsrunSettings,
supported: t.Dict[str, _TRunSettingsSelector] = {
"aprun": lambda launcher: AprunSettings,
"srun": lambda launcher: SrunSettings,
"mpirun": lambda launcher: MpirunSettings,
"mpiexec": lambda launcher: (
MpiexecSettings if launcher != "pals" else PalsMpiexecSettings
),
"orterun": lambda launcher: OrterunSettings,
"jsrun": lambda launcher: JsrunSettings,
}

# run commands supported by each launcher
# in order of suspected user preference
by_launcher = {
"slurm": ["srun", "mpirun", "mpiexec"],
"pbs": ["aprun", "mpirun", "mpiexec"],
"pals": ["mpiexec"],
"cobalt": ["aprun", "mpirun", "mpiexec"],
"lsf": ["jsrun", "mpirun", "mpiexec"],
"local": [""],
Expand Down Expand Up @@ -192,7 +198,7 @@ def _detect_command(launcher: str) -> str:

# if user specified and supported or auto detection worked
if run_command and run_command in supported:
return supported[run_command](
return supported[run_command](launcher)(
exe, exe_args, run_args, env_vars, container=container, **kwargs
)

Expand Down
31 changes: 31 additions & 0 deletions tests/test_configs/mpi_impl_stubs/pals/mpiexec
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/sh

# BSD 2-Clause License
#
# Copyright (c) 2021-2023, Hewlett Packard Enterprise
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Stub executable that stands in for the PALS ``mpiexec`` binary and prints a
# generic PALS ``mpiexec --version`` message. Any arguments passed to it are
# ignored, so the same banner is printed no matter how it is invoked.

echo "mpiexec version 1.2.12 revision d3dd612f9372 built Apr 12 2023"
16 changes: 7 additions & 9 deletions tests/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import contextlib

import pytest

Expand Down Expand Up @@ -154,15 +155,12 @@ def test_summary(fileutils):
assert 0 == int(row["RunID"])
assert 0 == int(row["Returncode"])

def test_launcher_detection(wlmutils, monkeypatch):
if wlmutils.get_test_launcher() == "pals":
pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals")
if wlmutils.get_test_launcher() == "local":
monkeypatch.setenv("PATH", "") # Remove all WLMs from PATH

def test_launcher_detection(wlmutils):
exp = Experiment("test-launcher-detection", launcher="auto")

# We check whether the right launcher is found. But if
# the test launcher was set to local, we tolerate finding
# another one (this cannot be avoided)
if (
exp._launcher != wlmutils.get_test_launcher()
and wlmutils.get_test_launcher() != "local"
):
assert False
assert exp._launcher == wlmutils.get_test_launcher()
Loading

0 comments on commit f0d510d

Please sign in to comment.