diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 522808d4a..214b2ae91 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -12,7 +12,7 @@ env: HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON" HOMEBREW_NO_GITHUB_API: "ON" HOMEBREW_NO_INSTALL_CLEANUP: "ON" - + DEBIAN_FRONTEND: "noninteractive" # Disable interactive apt install sessions jobs: run_tests: @@ -47,11 +47,29 @@ jobs: sudo apt-get install -y wget - name: Install GNU make for MacOS and set GITHUB_PATH - if: "contains( matrix.os, 'macos' )" + if: contains( matrix.os, 'macos' ) run: | brew install make || true echo "$(brew --prefix)/opt/make/libexec/gnubin" >> $GITHUB_PATH + - name: Build Singularity from source + if: contains( matrix.os, 'ubuntu' ) && matrix.py_v == 3.9 && matrix.rai == '1.2.5' + run: | + sudo apt-get install -y libseccomp-dev pkg-config squashfs-tools cryptsetup curl git # wget build-essential + echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc + source ~/.bashrc + export VERSION=1.0.0 # Apptainer (singularity) version + wget https://github.com/apptainer/apptainer/releases/download/v${VERSION}/apptainer-${VERSION}.tar.gz + tar -xzf apptainer-${VERSION}.tar.gz + cd apptainer-${VERSION} + ./mconfig + make -C builddir + sudo make -C builddir install + + - name: singularity pull test container # This lets us time how long the pull takes + if: contains( matrix.os, 'ubuntu' ) && matrix.py_v == 3.9 && matrix.rai == '1.2.5' + run: singularity pull docker://alrigazzi/smartsim-testing + - name: Install SmartSim (with ML backends) run: python -m pip install .[dev,ml,ray] diff --git a/docker/testing/Dockerfile b/docker/testing/Dockerfile new file mode 100644 index 000000000..52b74601a --- /dev/null +++ b/docker/testing/Dockerfile @@ -0,0 +1,8 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:21.10 +ENV DEBIAN_FRONTEND noninteractive +RUN apt update && apt install -y python3 python3-pip python-is-python3 cmake git +RUN pip install 
torch==1.9.1 +RUN git clone https://github.com/CrayLabs/SmartRedis.git +RUN cd SmartRedis && pip install . && make lib; cd .. + diff --git a/docker/testing/README.md b/docker/testing/README.md new file mode 100644 index 000000000..a034afde1 --- /dev/null +++ b/docker/testing/README.md @@ -0,0 +1,38 @@ +# container-testing + +This container is hosted on dockerhub to be used for SmartSim container +integration testing. Below are the commands to push an updated version of +the container. + +## Building and interacting with container locally + +```sh +# Build container +docker build -t container-testing . + +# Start a shell on container to try things out +docker run -it container-testing bash +``` + +Within the container, you can verify that you can import packages like +smartredis or pytorch locally. + +## Pushing container updates to DockerHub repository + +Note: <version> is bumped each time an update is pushed. +Versions have no relation to SmartSim versions. + +```sh +# See current versions to determine next version +docker image inspect --format '{{.RepoTags}}' alrigazzi/smartsim-testing + +docker login + +# Create tags for current build of container +docker image tag container-testing alrigazzi/smartsim-testing:latest +docker image tag container-testing alrigazzi/smartsim-testing:<version> + +# Push current build of container with all tags created +docker image push --all-tags alrigazzi/smartsim-testing +``` + diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index e9c730082..acdf210e4 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -75,6 +75,9 @@ def get_launch_cmd(self): launch_script_path = self.get_colocated_launch_script() aprun_cmd.extend([bash, launch_script_path]) + if self.run_settings.container: + aprun_cmd += self.run_settings.container._container_cmds() + aprun_cmd += self._build_exe() # if its in a batch, redirect stdout to diff --git
a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index 2628a0e8d..0c5337d54 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -53,6 +53,9 @@ def get_launch_cmd(self): launch_script_path = self.get_colocated_launch_script() cmd.extend([bash, launch_script_path]) + if self.run_settings.container: + cmd += self.run_settings.container._container_cmds() + # build executable cmd.extend(self.run_settings.exe) if self.run_settings.exe_args: diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index f388827cf..0ebacc8d2 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -157,6 +157,9 @@ def get_launch_cmd(self): launch_script_path = self.get_colocated_launch_script() srun_cmd += [bash, launch_script_path] + if self.run_settings.container: + srun_cmd += self.run_settings.container._container_cmds() + srun_cmd += self._build_exe() return srun_cmd diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 357439cea..1e91cbf70 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -512,6 +512,7 @@ def create_run_settings( run_command="auto", run_args=None, env_vars=None, + container=None, **kwargs, ): """Create a ``RunSettings`` instance. @@ -558,6 +559,7 @@ class in SmartSim. 
If found, the class corresponding run_command=run_command, run_args=run_args, env_vars=env_vars, + container=container, **kwargs, ) except SmartSimError as e: diff --git a/smartsim/settings/__init__.py b/smartsim/settings/__init__.py index 82985e59a..cf18fec48 100644 --- a/smartsim/settings/__init__.py +++ b/smartsim/settings/__init__.py @@ -5,6 +5,7 @@ from .mpirunSettings import MpiexecSettings, MpirunSettings, OrterunSettings from .pbsSettings import QsubBatchSettings from .slurmSettings import SbatchSettings, SrunSettings +from .containers import Container, Singularity __all__ = [ "AprunSettings", @@ -18,4 +19,6 @@ "RunSettings", "SbatchSettings", "SrunSettings", + "Container", + "Singularity", ] diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index 0ad4a011c..2234fac5c 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -67,6 +67,10 @@ def make_mpmd(self, aprun_settings): raise SSUnsupportedError( "Colocated models cannot be run as a mpmd workload" ) + if self.container: + raise SSUnsupportedError( + "Containerized MPMD workloads are not yet supported." + ) self.mpmd.append(aprun_settings) def set_cpus_per_task(self, cpus_per_task): diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index af000db09..0082eaf21 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -1,5 +1,4 @@ -# BSD 2-Clause License -# +# BSD 2-Clause License # # Copyright (c) 2021-2022, Hewlett Packard Enterprise # All rights reserved. # @@ -38,6 +37,7 @@ def __init__( run_command="", run_args=None, env_vars=None, + container=None, **kwargs, ): """Run parameters for a ``Model`` @@ -69,11 +69,19 @@ def __init__( :type run_args: dict[str, str], optional :param env_vars: environment vars to launch job with, defaults to None :type env_vars: dict[str, str], optional + :param container: container type for workload (e.g. 
class Container:
    """Base class for container types in SmartSim.

    Container types are used to embed all the information needed to
    launch a workload within a container into a single object.

    :param image: local or remote path to container image
    :type image: str
    :param args: arguments to container command
    :type args: str | list[str], optional
    :param mount: paths to mount (bind) from host machine into image.
    :type mount: str | list[str] | dict[str, str], optional
    :raises TypeError: if ``image``, ``args`` or ``mount`` has an
                       unsupported type
    """

    def __init__(self, image, args="", mount=""):
        # Validate types up front so a bad value fails at construction
        # time rather than when the launch command is assembled.
        if not isinstance(image, str):
            raise TypeError("image must be a str")
        if not isinstance(args, (str, list)):
            raise TypeError("args must be a str | list")
        if not isinstance(mount, (str, list, dict)):
            raise TypeError("mount must be a str | list | dict")

        self.image = image
        self.args = args
        self.mount = mount

    def _containerized_run_command(self, run_command: str):
        """Return modified run_command with container commands prepended.

        :param run_command: run command from a RunSettings class
        :type run_command: str
        :raises NotImplementedError: always; subclasses must override
        """
        raise NotImplementedError(
            f"Containerized run command specification not implemented for this Container type: {type(self)}"
        )

    def _container_cmds(self):
        """Return list of container commands to be inserted before exe.

        The launcher steps call this method on whatever container object
        is attached to the run settings, so it is declared on the base
        class to give a clear error for container types that do not
        implement it.

        :raises NotImplementedError: always; subclasses must override
        """
        raise NotImplementedError(
            f"Container command generation not implemented for this Container type: {type(self)}"
        )


class Singularity(Container):
    """Singularity (apptainer) container type.

    .. note::

        Singularity integration is currently tested with
        `Apptainer 1.0 <https://apptainer.org/docs/user/1.0/index.html>`_
        with slurm and PBS workload managers only.

        Also, note that user-defined bind paths (``mount`` argument) may be
        disabled by a
        `system administrator
        <https://apptainer.org/docs/admin/1.0/configfiles.html#bind-mount-management>`_


    :param image: local or remote path to container image,
                  e.g. 'docker://sylabsio/lolcow'
    :type image: str
    :param args: arguments to 'singularity exec' command
    :type args: str | list[str], optional
    :param mount: paths to mount (bind) from host machine into image.
    :type mount: str | list[str] | dict[str, str], optional
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _container_cmds(self):
        """Return list of container commands to be inserted before exe.

        Container members are validated during this call, because the
        attributes may have been reassigned after construction.

        The result has the form
        ``[singularity, 'exec', <args>, '--bind', <mounts>, image]`` where
        the args element and the ``--bind`` pair are only present when
        ``args`` / ``mount`` are non-empty.

        :returns: command fragments to place in front of the executable
        :rtype: list[str]
        :raises TypeError: if object members are invalid types
        """
        # Serialize args into a single str element
        serialized_args = ""
        if self.args:
            if isinstance(self.args, str):
                serialized_args = self.args
            elif isinstance(self.args, list):
                serialized_args = " ".join(self.args)
            else:
                raise TypeError("self.args must be a str | list")

        # Serialize bind paths into singularity's comma-separated format;
        # dict entries map host path -> image path (falsy image path means
        # "mount at the same location").
        serialized_mount = ""
        if self.mount:
            if isinstance(self.mount, str):
                serialized_mount = self.mount
            elif isinstance(self.mount, list):
                serialized_mount = ",".join(self.mount)
            elif isinstance(self.mount, dict):
                serialized_mount = ",".join(
                    f"{host_path}:{img_path}" if img_path else host_path
                    for host_path, img_path in self.mount.items()
                )
            else:
                raise TypeError("self.mount must be str | list | dict")

        # Find full path to singularity
        singularity = shutil.which("singularity")

        # Some systems have singularity available on compute nodes only,
        # so warn instead of error
        if not singularity:
            logger.warning('Unable to find singularity. Continuing in case singularity is available on compute node')
            # Fall back to the bare command name so the launch command
            # never contains None and can still be resolved via PATH on
            # the compute node.
            singularity = "singularity"

        # Construct containerized launch command
        cmd_list = [singularity, "exec"]
        if serialized_args:
            cmd_list.append(serialized_args)
        if serialized_mount:
            cmd_list.extend(["--bind", serialized_mount])
        cmd_list.append(self.image)

        return cmd_list
@@ -163,9 +164,9 @@ def _detect_command(launcher): # if user specified and supported or auto detection worked if run_command and run_command in supported: - return supported[run_command](exe, exe_args, run_args, env_vars, **kwargs) + return supported[run_command](exe, exe_args, run_args, env_vars, container=container, **kwargs) # 1) user specified and not implementation in SmartSim # 2) user supplied run_command=None # 3) local launcher being used and default of "auto" was passed. - return RunSettings(exe, exe_args, run_command, run_args, env_vars) + return RunSettings(exe, exe_args, run_command, run_args, env_vars, container=container) diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 929ccdc33..e07465b28 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -89,6 +89,10 @@ def make_mpmd(self, srun_settings): raise SSUnsupportedError( "Colocated models cannot be run as a mpmd workload" ) + if self.container: + raise SSUnsupportedError( + "Containerized MPMD workloads are not yet supported." 
@pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
def test_singularity_wlm_smartredis(fileutils, wlmutils):
    """Run two processes, each process puts a tensor on
    the DB, then accesses the other process's tensor.
    Finally, the tensor is used to run a model.

    Note: This is a containerized port of test_smartredis.py for WLM system
    """

    launcher = wlmutils.get_test_launcher()
    print(launcher)
    if launcher not in ["pbs", "slurm"]:
        pytest.skip(
            f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}"
        )

    test_dir = fileutils.make_test_dir()
    exp = Experiment(
        "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher
    )

    # create and start a database
    orc = exp.create_database()
    exp.generate()
    exp.start(orc, block=False)

    # From here on the orchestrator is running: make sure it is stopped
    # even if building or launching the ensemble raises.
    try:
        container = Singularity(containerURI)
        rs = exp.create_run_settings(
            "python3", "producer.py --exchange", container=container
        )
        rs.set_tasks(1)
        params = {"mult": [1, -10]}
        ensemble = Ensemble(
            name="producer",
            params=params,
            run_settings=rs,
            perm_strat="step",
        )

        # each producer reads the tensor written by the other one
        ensemble.register_incoming_entity(ensemble["producer_0"])
        ensemble.register_incoming_entity(ensemble["producer_1"])

        config = fileutils.get_test_conf_path("smartredis")
        ensemble.attach_generator_files(to_copy=[config])

        exp.generate(ensemble)

        # start the models
        exp.start(ensemble, summary=False)

        # get and confirm statuses
        statuses = exp.get_status(ensemble)
        assert all(
            stat == status.STATUS_COMPLETED for stat in statuses
        ), "client ensemble failed"
    finally:
        # stop the orchestrator
        exp.stop(orc)

    print(exp.summary())
def test_singularity_commands(fileutils):
    """Test generation of singularity commands."""

    # Note: We skip first element so singularity is not needed to run test

    # (constructor keyword arguments, expected command after the binary)
    cases = [
        ({}, f"exec {containerURI}"),
        ({"args": "--verbose"}, f"exec --verbose {containerURI}"),
        (
            {"args": ["--verbose", "--cleanenv"]},
            f"exec --verbose --cleanenv {containerURI}",
        ),
        ({"mount": "/usr/local/bin"}, f"exec --bind /usr/local/bin {containerURI}"),
        (
            {"mount": ["/usr/local/bin", "/lus/datasets"]},
            f"exec --bind /usr/local/bin,/lus/datasets {containerURI}",
        ),
        (
            {
                "mount": {
                    "/usr/local/bin": "/bin",
                    "/lus/datasets": "/datasets",
                    "/cray/css/smartsim": None,
                }
            },
            f"exec --bind /usr/local/bin:/bin,/lus/datasets:/datasets,/cray/css/smartsim {containerURI}",
        ),
        (
            {"args": "--verbose", "mount": "/usr/local/bin"},
            f"exec --verbose --bind /usr/local/bin {containerURI}",
        ),
    ]

    for kwargs, expected in cases:
        c = Singularity(containerURI, **kwargs)
        cmd = " ".join(c._container_cmds()[1:])
        assert cmd == expected


@pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
def test_singularity_basic(fileutils):
    """Basic argument-less Singularity test"""
    test_dir = fileutils.make_test_dir()

    container = Singularity(containerURI)

    exp = Experiment("singularity_basic", exp_path=test_dir, launcher="local")
    run_settings = exp.create_run_settings(
        "python3", "sleep.py --time=3", container=container
    )
    model = exp.create_model("singularity_basic", run_settings)

    script = fileutils.get_test_conf_path("sleep.py")
    model.attach_generator_files(to_copy=[script])
    exp.generate(model)

    exp.start(model, summary=False)

    # get and confirm status
    stat = exp.get_status(model)[0]
    assert stat == status.STATUS_COMPLETED

    print(exp.summary())


@pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
def test_singularity_args(fileutils):
    """Test combinations of args and mount arguments for Singularity"""
    test_dir = fileutils.make_test_dir()
    hometest_dir = os.path.join(str(Path.home()), "test")  # $HOME/test
    # Bind the model's run directory onto $HOME/test inside the container
    mount_paths = {os.path.join(test_dir, "singularity_args"): hometest_dir}
    container = Singularity(containerURI, args="--contain", mount=mount_paths)

    exp = Experiment("singularity_args", launcher="local", exp_path=test_dir)

    run_settings = exp.create_run_settings(
        "python3", "test/check_dirs.py", container=container
    )
    model = exp.create_model("singularity_args", run_settings)
    script = fileutils.get_test_conf_path("check_dirs.py")
    model.attach_generator_files(to_copy=[script])
    exp.generate(model)

    exp.start(model, summary=False)

    # get and confirm status
    stat = exp.get_status(model)[0]
    assert stat == status.STATUS_COMPLETED

    print(exp.summary())


@pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
def test_singularity_smartredis(fileutils, wlmutils):
    """Run two processes, each process puts a tensor on
    the DB, then accesses the other process's tensor.
    Finally, the tensor is used to run a model.

    Note: This is a containerized port of test_smartredis.py
    """

    test_dir = fileutils.make_test_dir()
    exp = Experiment(
        "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local"
    )

    # create and start a database
    orc = Orchestrator(port=wlmutils.get_test_port())
    exp.generate(orc)
    exp.start(orc, block=False)

    # From here on the orchestrator is running: make sure it is stopped
    # even if building or launching the ensemble raises.
    try:
        container = Singularity(containerURI)

        rs = exp.create_run_settings(
            "python3", "producer.py --exchange", container=container
        )
        params = {"mult": [1, -10]}
        ensemble = Ensemble(
            name="producer",
            params=params,
            run_settings=rs,
            perm_strat="step",
        )

        # each producer reads the tensor written by the other one
        ensemble.register_incoming_entity(ensemble["producer_0"])
        ensemble.register_incoming_entity(ensemble["producer_1"])

        config = fileutils.get_test_conf_path("smartredis")
        ensemble.attach_generator_files(to_copy=[config])

        exp.generate(ensemble)

        # start the models
        exp.start(ensemble, summary=False)

        # get and confirm statuses
        statuses = exp.get_status(ensemble)
        assert all(
            stat == status.STATUS_COMPLETED for stat in statuses
        ), "client ensemble failed"
    finally:
        # stop the orchestrator
        exp.stop(orc)

    print(exp.summary())