From 6a761dfb09fc4342a8ff120566c3d59ef74b0d4c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 25 Feb 2023 15:36:44 +0100 Subject: [PATCH 1/2] Drop ray support and delete references in docs --- .github/workflows/run_tests.yml | 2 +- README.md | 50 --- doc/api/smartsim_api.rst | 17 - doc/changelog.rst | 4 + doc/index.rst | 2 - doc/installation.rst | 3 - docker/prod/Dockerfile | 1 - setup.py | 3 - smartsim/_core/control/controller.py | 4 +- smartsim/_core/control/manifest.py | 29 +- smartsim/_core/entrypoints/ray.py | 116 ------ smartsim/_core/generation/generator.py | 1 - smartsim/exp/ray/__init__.py | 27 -- smartsim/exp/ray/raycluster.py | 467 ---------------------- smartsim/experiment.py | 4 - tests/full_wlm/with_ray/test_ray_batch.py | 111 ----- tests/test_manifest.py | 19 +- tests/with_ray/test_ray.py | 233 ----------- tutorials/ray/starting_ray.ipynb | 289 ------------- 19 files changed, 18 insertions(+), 1364 deletions(-) delete mode 100644 smartsim/_core/entrypoints/ray.py delete mode 100644 smartsim/exp/ray/__init__.py delete mode 100644 smartsim/exp/ray/raycluster.py delete mode 100644 tests/full_wlm/with_ray/test_ray_batch.py delete mode 100644 tests/with_ray/test_ray.py delete mode 100644 tutorials/ray/starting_ray.ipynb diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 05ec237e1..d8f31bc2a 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -105,7 +105,7 @@ jobs: - name: Install SmartSim (with ML backends) run: | python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis - python -m pip install .[dev,ml,ray] + python -m pip install .[dev,ml] - name: Install ML Runtimes with Smart if: contains( matrix.os, 'macos' ) diff --git a/README.md b/README.md index 207989024..ddf8b9382 100644 --- a/README.md +++ b/README.md @@ -69,8 +69,6 @@ exchanged between applications at runtime without the utilization of MPI. - [Local Launch](#local-launch) - [Interactive Launch](#interactive-launch) - [Batch Launch](#batch-launch) - - [Ray](#ray) - - [Ray on HPC](#ray-on-hpc) - [SmartRedis](#smartredis) - [Tensors](#tensors) - [Datasets](#datasets) @@ -284,7 +282,6 @@ initialization. Local launching does not support batch workloads. # Infrastructure Library Applications - Orchestrator - In-memory data store and Machine Learning Inference (Redis + RedisAI) - - Ray - Distributed Reinforcement Learning (RL), Hyperparameter Optimization (HPO) ## Redis + RedisAI @@ -398,53 +395,6 @@ exp.stop(db_cluster) python run_db_batch.py ``` ------ -## Ray - -Ray is a distributed computation framework that supports a number of applications - - RLlib - Distributed Reinforcement Learning (RL) - - RaySGD - Distributed Training - - Ray Tune - Hyperparameter Optimization (HPO) - - Ray Serve - ML/DL inference -As well as other integrations with frameworks like Modin, Mars, Dask, and Spark. - -Historically, Ray has not been well supported on HPC systems. A few examples exist, -but none are well maintained. Because SmartSim already has launchers for HPC systems, -launching Ray through SmartSim is a relatively simple task. - -### Ray on HPC - -Below is an example of how to launch a Ray cluster on an HPC system and connect to it. -In this example, we set `batch=True`, which means that the cluster will be started -requesting an allocation through the scheduler (Slurm, PBS, etc). If this code -is run within a sufficiently large interactive allocation, setting `batch=False` -will spin the Ray cluster on the allocated nodes. - -```Python -import ray - -from smartsim import Experiment -from smartsim.exp.ray import RayCluster - -exp = Experiment("ray-cluster", launcher='auto') -# 3 workers + 1 head node = 4 node-cluster -cluster = RayCluster(name="ray-cluster", run_args={}, - ray_args={"num-cpus": 24}, - launcher='auto', num_nodes=4, batch=True) - -exp.generate(cluster, overwrite=True) -exp.start(cluster, block=False, summary=True) - -# Connect to the Ray cluster -ctx = ray.init(f"ray://{cluster.get_head_address()}:10001") - -# -``` - -*New in 0.4.0* the auto argument enables the Ray Cluster to be launched -across scheduler types. Both batch launch and interactive launch commands -will be automatically detected and used by SmartSim. - ------ # SmartRedis diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 8dcf98276..9a836606e 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -538,20 +538,3 @@ Slurm .. automodule:: smartsim.slurm :members: - -Ray -=== - -.. currentmodule:: smartsim.exp.ray - -.. _ray_api: - -``RayCluster`` is used to launch a Ray cluster - and can be launched as a batch or in an interactive allocation. - -.. autoclass:: RayCluster - :show-inheritance: - :members: - :inherited-members: - :undoc-members: - :exclude-members: batch set_path type diff --git a/doc/changelog.rst b/doc/changelog.rst index f42879ce7..b0d535146 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -26,6 +26,7 @@ Description - Fix bug in colocated database entrypoint when loading PyTorch models - Add support for RedisAI 1.2.7, pyTorch 1.11.0, Tensorflow 2.8.0, ONNXRuntime 1.11.1 - Allow for models to be launched independently as batch jobs +- Drop support for Ray Detailed Notes @@ -38,6 +39,9 @@ Detailed Notes satisfied, the `Experiment` will attempt to wrap the underlying run command in a batch job using the object referenced at `Model.batch_settings` as the batch settings for the job. If the check is not satisfied, the `Model` is launched in the traditional manner as a job step. (PR245_) +- The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. + We plan to release a separate add-on library to accomplish the same results. If + you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! .. _PR255: https://github.com/CrayLabs/SmartSim/pull/258 .. _PR245: https://github.com/CrayLabs/SmartSim/pull/245 diff --git a/doc/index.rst b/doc/index.rst index 4ae1b3c5a..661fd7b2f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -23,8 +23,6 @@ tutorials/online_analysis/lattice/online_analysis tutorials/ml_inference/Inference-in-SmartSim tutorials/ml_training/surrogate/train_surrogate - tutorials/ray/starting_ray - .. toctree:: :maxdepth: 2 diff --git a/doc/installation.rst b/doc/installation.rst index 982d50fb5..7a51de6d2 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -124,11 +124,8 @@ can request their installation through the ``ml`` flag as follows: .. code-block:: bash pip install smartsim[ml] - # add ray extra if you would like to use ray with SmartSim as well - pip install smartsim[ml,ray] # or if using ZSH pip install smartsim\[ml\] - pip install smartsim\[ml,ray\] At this point, SmartSim is installed and can be used for more basic features. diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index 77ba8520e..75e437963 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -52,5 +52,4 @@ RUN python -m pip install smartsim[ml]==0.4.1 jupyter jupyterlab matplotlib && \ rm -rf ~/.cache/pip # remove non-jupyter notebook tutorials -RUN rm -rf /home/craylabs/tutorials/ray CMD ["/bin/bash", "-c", "PATH=/home/craylabs/.local/bin:$PATH /home/craylabs/.local/bin/jupyter lab --port 8888 --no-browser --ip=0.0.0.0"] diff --git a/setup.py b/setup.py index aeadf7b97..e5dd3c150 100644 --- a/setup.py +++ b/setup.py @@ -184,9 +184,6 @@ def has_ext_modules(_placeholder): ], # see smartsim/_core/_install/buildenv.py for more details "ml": versions.ml_extras_required(), - "ray": [ - "ray==1.6", - ], } diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index c32c9776f..c70585271 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -290,15 +290,13 @@ def _launch(self, manifest): raise SmartSimError(msg) self._launch_orchestrator(orchestrator) - for rc in manifest.ray_clusters: # cov-wlm - rc._update_workers() if self.orchestrator_active: self._set_dbobjects(manifest) # create all steps prior to launch steps = [] - all_entity_lists = manifest.ensembles + manifest.ray_clusters + all_entity_lists = manifest.ensembles for elist in all_entity_lists: if elist.batch: batch_step = self._create_batch_job_step(elist) diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index d76289966..90f27d565 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -27,13 +27,12 @@ from ...database import Orchestrator from ...entity import EntityList, SmartSimEntity from ...error import SmartSimError -from ...exp.ray import RayCluster from ..utils.helpers import fmt_dict # List of types derived from EntityList which require specific behavior # A corresponding property needs to exist (like db for Orchestrator), # otherwise they will not be accessible -entity_list_exception_types = [Orchestrator, RayCluster] +entity_list_exception_types = [Orchestrator] class Manifest: @@ -51,6 +50,7 @@ def __init__(self, *args): self._check_names(self._deployables) self._check_entity_lists_nonempty() + @property def db(self): """Return Orchestrator instances in Manifest @@ -69,6 +69,7 @@ def db(self): _db = deployable return _db + @property def models(self): """Return Model instances in Manifest @@ -82,6 +83,7 @@ def models(self): _models.append(deployable) return _models + @property def ensembles(self): """Return Ensemble instances in Manifest @@ -101,34 +103,23 @@ def ensembles(self): return _ensembles - @property - def ray_clusters(self): - """Return all RayCluster instances in Manifest - - :return: list of RayCluster instances - :rtype: List[RayCluster] - """ - _ray_cluster = [] - for deployable in self._deployables: - if isinstance(deployable, RayCluster): - _ray_cluster.append(deployable) - return _ray_cluster @property def all_entity_lists(self): """All entity lists, including ensembles and - exceptional ones like Orchestrator and RayCluster + exceptional ones like Orchestrator :return: list of entity lists :rtype: List[EntityList] """ - _all_entity_lists = self.ray_clusters + self.ensembles + _all_entity_lists = self.ensembles db = self.db if db is not None: _all_entity_lists.append(db) return _all_entity_lists + def _check_names(self, deployables): used = [] for deployable in deployables: @@ -139,6 +130,7 @@ def _check_names(self, deployables): raise SmartSimError("User provided two entities with the same name") used.append(name) + def _check_types(self, deployables): for deployable in deployables: if not ( @@ -149,6 +141,7 @@ def _check_types(self, deployables): f"Entity has type {type(deployable)}, not SmartSimEntity or EntityList" ) + def _check_entity_lists_nonempty(self): """Check deployables for sanity before launching""" @@ -156,6 +149,7 @@ def _check_entity_lists_nonempty(self): if len(entity_list) < 1: raise ValueError(f"{entity_list.name} is empty. Nothing to launch.") + def __str__(self): s = "" e_header = "=== Ensembles ===\n" @@ -164,8 +158,7 @@ def __str__(self): if self.ensembles: s += e_header - # include ray clusters as an ensemble while still in experimental API - all_ensembles = self.ensembles + self.ray_clusters + all_ensembles = self.ensembles for ensemble in all_ensembles: s += f"{ensemble.name}\n" s += f"Members: {len(ensemble)}\n" diff --git a/smartsim/_core/entrypoints/ray.py b/smartsim/_core/entrypoints/ray.py deleted file mode 100644 index 77c7440b4..000000000 --- a/smartsim/_core/entrypoints/ray.py +++ /dev/null @@ -1,116 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse -import os -from subprocess import PIPE, STDOUT, Popen -from typing import List - -from smartsim._core.utils.network import current_ip -from smartsim.exp.ray import parse_ray_head_node_address - - -def main( - network_interface: str, - port: int, - is_head: bool, - password: str, - ray_exe: str, - ray_args: List[str], - dash_port: str, - head_log: str, -): - - ip_address = current_ip(network_interface) - - cliargs = [ - ray_exe, - "start", - "--head" - if is_head - else f"--address={parse_ray_head_node_address(head_log)}:{port}", - "--block", - f"--node-ip-address={ip_address}", - ] - - if ray_args: - cliargs += ray_args - if is_head and not any( - [arg.startswith("--dashboard-host") for arg in ray_args] - ): - cliargs += [f"--dashboard-host={ip_address}"] - - if password: - cliargs += [f"--redis-password={password}"] - - if is_head: - cliargs += [f"--port={port}", f"--dashboard-port={dash_port}"] - - cmd = " ".join(cliargs) - print(f"Ray Command: {cmd}") - - p = Popen(cliargs, stdout=PIPE, stderr=STDOUT) - - for line in iter(p.stdout.readline, b""): - print(line.decode("utf-8").rstrip(), flush=True) - - -if __name__ == "__main__": - - os.environ["PYTHONUNBUFFERED"] = "1" - - parser = argparse.ArgumentParser( - prefix_chars="+", description="SmartSim Ray head launcher" - ) - parser.add_argument( - "+port", type=int, help="Port used by Ray to start the Redis server at" - ) - parser.add_argument("+head", action="store_true") - parser.add_argument("+redis-password", type=str, help="Password of Redis cluster") - parser.add_argument( - "+ray-args", action="append", help="Additional arguments to start Ray" - ) - parser.add_argument("+dashboard-port", type=str, help="Ray dashboard port") - parser.add_argument("+ray-exe", type=str, help="Ray executable", default="ray") - parser.add_argument("+ifname", type=str, help="Interface name", default="lo") - parser.add_argument("+head-log", type=str, help="Head node log") - args = parser.parse_args() - - if not args.head and not args.head_log: - raise argparse.ArgumentError( - "Ray starter needs +head or +head-log to start head or worker nodes respectively" - ) - - main( - args.ifname, - args.port, - args.head, - args.redis_password, - args.ray_exe, - args.ray_args, - args.dashboard_port, - args.head_log, - ) diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index ac06fe0cd..d1628cbb8 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -86,7 +86,6 @@ def generate_experiment(self, *args): self._gen_orc_dir(generator_manifest.db) self._gen_entity_list_dir(generator_manifest.ensembles) self._gen_entity_dirs(generator_manifest.models) - self._gen_entity_list_dir(generator_manifest.ray_clusters) def set_tag(self, tag, regex=None): """Set the tag used for tagging input files diff --git a/smartsim/exp/ray/__init__.py b/smartsim/exp/ray/__init__.py deleted file mode 100644 index ae6153808..000000000 --- a/smartsim/exp/ray/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from .raycluster import RayCluster, parse_ray_head_node_address diff --git a/smartsim/exp/ray/raycluster.py b/smartsim/exp/ray/raycluster.py deleted file mode 100644 index bed58e5c5..000000000 --- a/smartsim/exp/ray/raycluster.py +++ /dev/null @@ -1,467 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import re -import sys -import time as _time -import uuid - -from ..._core.utils import init_default -from ..._core.utils.helpers import expand_exe_path -from ...entity import EntityList, SmartSimEntity -from ...error import SmartSimError, SSUnsupportedError -from ...log import get_logger -from ...settings import settings - -logger = get_logger(__name__) - - -class RayCluster(EntityList): - """Entity used to run a Ray cluster on a given number of hosts. One Ray node is - launched on each host, and the first host is used to launch the head node. - - :param name: The name of the entity. - :type name: str - :param path: Path to output, error, and configuration files - :type path: str - :param ray_port: Port at which the head node will be running. - :type ray_port: int - :param ray_args: Arguments to be passed to Ray executable. - :type ray_args: dict[str,str] - :param num_nodes: Number of hosts, includes 1 head node and all worker nodes. - :type num_nodes: int - :param run_args: Arguments to pass to launcher to specify details such as partition or time. - :type run_args: dict[str,str] - :param batch_args: Additional batch arguments passed to launcher when running batch jobs. - :type batch_args: dict[str,str] - :param launcher: Name of launcher to use for starting the cluster. - :type launcher: str - :param interface: Name of network interface the cluster nodes should bind to. - :type interface: str - :param alloc: ID of allocation to run on, if obtained with ``smartsim.slurm.get_allocation`` - :type alloc: int - :param batch: Whether cluster should be launched as batch file, ignored when ``launcher`` is `local` - :type batch: bool - :param time: The walltime the cluster will be running for - :type time: str - :param run_command: Specify launch binary, defaults to automatic selection. - :type run_command: str - :param hosts: Specify hosts to launch on, defaults to None. Optional if not launching with OpenMPI. - :type hosts: str, list[str] - :param password: Password to use for Redis server, which is passed as `--redis_password` to `ray start`. - Can be set to - - `auto`: a strong password will be generated internally - - a string: it will be used as password - - `None`: the default Ray password will be used. - Defaults to `auto` - :type password: str - """ - - def __init__( - self, - name, - path=os.getcwd(), - ray_port=6789, - ray_args=None, - num_nodes=1, - run_args=None, - batch_args=None, - launcher="local", - batch=False, - time="01:00:00", - interface="ipogif0", - alloc=None, - run_command=None, - host_list=None, - password="auto", - **kwargs, - ): - launcher = launcher.lower() - supported_launchers = ["slurm", "pbs", "cobalt"] - if launcher not in supported_launchers: - raise SSUnsupportedError( - "The supported launchers for RayCluster are", - *[f"{launcher_name}," for launcher_name in supported_launchers], - f"but {launcher} was provided.", - ) - - if password: - if password == "auto": - self._ray_password = str(uuid.uuid4()) - else: - self._ray_password = password - else: - self._ray_password = None - - if num_nodes < 1: - raise ValueError("Number of nodes must be larger than 0.") - - self.alloc = None - self.batch_settings = None - self._hosts = None - - run_args = init_default({}, run_args, dict) - batch_args = init_default({}, batch_args, dict) - ray_args = init_default({}, ray_args, dict) - - self._ray_args = ray_args - super().__init__( - name=name, - path=path, - ray_args=ray_args, - run_args=run_args, - ray_port=ray_port, - launcher=launcher, - interface=interface, - alloc=alloc, - num_nodes=num_nodes, - run_command=run_command if run_command else "auto", - host_list=host_list, - **kwargs, - ) - if batch: - self.batch_settings = settings.create_batch_settings( - launcher=launcher, - nodes=num_nodes, - time=time, - batch_args=batch_args, - **kwargs, - ) - self.ray_head_address = None - - if host_list: - self.set_hosts(host_list=host_list, launcher=launcher) - - @property - def batch(self): - try: - if self.batch_settings: - return True - return False - except AttributeError: - return False - - def set_hosts(self, host_list, launcher): - """Specify the hosts for the ``RayCluster`` to launch on. This is - optional, unless ``run_command`` is `mpirun`. - - :param host_list: list of hosts (compute node names) - :type host_list: str | list[str] - :raises TypeError: if wrong type - """ - if isinstance(host_list, str): - host_list = [host_list.strip()] - if not isinstance(host_list, list): - raise TypeError("host_list argument must be a list of strings") - if not all([isinstance(host, str) for host in host_list]): - raise TypeError("host_list argument must be list of strings") - # TODO check length - if self.batch: - self.batch_settings.set_hostlist(host_list) - for host, node in zip(host_list, self.entities): - # Aprun doesn't like settings hosts in batch launch - if launcher == "pbs" or launcher == "cobalt": - if not self.batch: - node.run_settings.set_hostlist([host]) - else: - node.run_settings.set_hostlist([host]) - - def _initialize_entities(self, **kwargs): - - ray_port = kwargs.get("ray_port", 6789) - launcher = kwargs.get("launcher", "slurm") - ray_args = kwargs.get("ray_args", None) - run_args = kwargs.get("run_args", None) - interface = kwargs.get("interface", "ipogif0") - num_nodes = kwargs.get("num_nodes", 0) - alloc = kwargs.get("alloc", None) - run_command = kwargs.get("run_command", None) - - ray_head = RayHead( - name="ray_head", - path=self.path, - ray_password=self._ray_password, - ray_port=ray_port, - launcher=launcher, - run_args=run_args.copy(), - ray_args=ray_args.copy(), - interface=interface, - run_command=run_command, - alloc=alloc, - ) - - self.entities.append(ray_head) - - for worker_id in range(num_nodes - 1): - worker_model = RayWorker( - name=f"ray_worker_{worker_id}", - path=self.path, - run_args=run_args.copy(), - ray_port=ray_port, - ray_password=self._ray_password, - ray_args=ray_args.copy(), - interface=interface, - run_command=run_command, - launcher=launcher, - alloc=alloc, - ) - self.entities.append(worker_model) - - def get_head_address(self): - """Return address of head node - - If address has not been initialized, returns None - - :returns: Address of head node - :rtype: str - """ - if not self.ray_head_address: - self.ray_head_address = parse_ray_head_node_address( - os.path.join(self.entities[0].path, self.entities[0].name + ".out") - ) - return self.ray_head_address - - def get_dashboard_address(self): - """Returns dashboard address - - The format is : - - :returns: Dashboard address - :rtype: str - """ - return self.get_head_address() + ":" + str(self.entities[0].dashboard_port) - - def _update_workers(self): - """Update worker args before launching them.""" - for worker in range(1, len(self.entities)): - self.entities[worker].set_head_log( - f"{os.path.join(self.entities[0].path, self.entities[0].name)}.out" - ) - - -def find_ray_exe(): - """Find ray executable in current path.""" - # TODO add this to CONFIG? - try: - return expand_exe_path("ray") - except (TypeError, FileNotFoundError): - raise SmartSimError("Could not find ray executable") - - -def parse_ray_head_node_address(head_log): - """Get the ray head node host address from the log file produced - by the head process. - - :param head_log: full path to log file of head node - :return: address of the head host - :rtype: str - """ - - max_attempts = 24 - attempts = 0 - while not os.path.isfile(head_log): - _time.sleep(5) - attempts += 1 - if attempts == max_attempts: - raise RuntimeError(f"Could not find Ray cluster head log file {head_log}") - - attempts = 0 - head_ip = None - while head_ip is None: - _time.sleep(5) - with open(head_log) as fp: - line = fp.readline() - while line: - plain_line = re.sub("\033\\[([0-9]+)(;[0-9]+)*m", "", line) - if "Local node IP:" in plain_line: - matches = re.search(r"(?<=Local node IP: ).*", plain_line) - head_ip = matches.group() - break - line = fp.readline() - attempts += 1 - if attempts == max_attempts: - raise RuntimeError( - f"Could not find Ray cluster head address in log file {head_log}." - ) - - return head_ip - - -class RayHead(SmartSimEntity): - def __init__( - self, - name, - path, - ray_password, - ray_port=6789, - run_args=None, - ray_args=None, - launcher="slurm", - interface="ipogif0", - run_command=None, - alloc=None, - dash_port=8265, - **kwargs, - ): - self.dashboard_port = dash_port - self.batch_settings = None - self.files = None - - run_args = init_default({}, run_args, dict) - ray_args = init_default({}, ray_args, dict) - - ray_exe_args = self._build_ray_exe_args( - ray_port, ray_password, interface, ray_args - ) - - run_settings = settings.create_run_settings( - launcher=launcher, - exe="python", - exe_args=ray_exe_args, - run_args=run_args, - run_command=run_command if run_command else "auto", - alloc=alloc, - **kwargs, - ) - - run_settings.set_tasks_per_node(1) - run_settings.set_tasks(1) - - super().__init__(name, path, run_settings) - - def _build_ray_exe_args(self, ray_port, ray_password, interface, ray_args): - - # python script that launches ray head node - ray_starter_args = [ - "-m", - "smartsim._core.entrypoints.ray", - f"+port={ray_port}", - f"+ifname={interface}", - f"+ray-exe={find_ray_exe()}", - f"+head", - ] - - if ray_password: - ray_starter_args += [f"+redis-password={ray_password}"] - - if "dashboard-port" in ray_args: - self.dashboard_port = int(ray_args["dashboard-port"]) - ray_starter_args += [f"+dashboard-port={self.dashboard_port}"] - - used = ["block", "redis-password", "start", "head", "port", "dashboard-port"] - extra_ray_args = [] - for key, value in ray_args.items(): - if key not in used: - extra_ray_args += [f"+ray-args=--{key}={value}"] - ray_starter_args += extra_ray_args - - return " ".join(ray_starter_args) - - -class RayWorker(SmartSimEntity): - def __init__( - self, - name, - path, - ray_password, - ray_port, - run_args=None, - ray_args=None, - interface="ipogif0", - launcher="slurm", - run_command=None, - alloc=None, - **kwargs, - ): - - self.batch_settings = None - self.files = None - - run_args = init_default({}, run_args, dict) - ray_args = init_default({}, ray_args, dict) - - ray_exe_args = self._build_ray_exe_args( - ray_password, ray_args, ray_port, interface - ) - - run_settings = settings.create_run_settings( - launcher=launcher, - exe=sys.executable, - exe_args=ray_exe_args, - run_args=run_args, - run_command=run_command, - alloc=alloc, - **kwargs, - ) - - run_settings.set_tasks_per_node(1) - run_settings.set_tasks(1) - - super().__init__(name, path, run_settings) - - @property - def batch(self): - return False - - def set_head_log(self, head_log): - """Set head log file (with full path) - - The head log file is used by the worker to discover - the head IP address. This function is called by - RayCluster before the cluster is launched. - """ - self.run_settings.add_exe_args([f"+head-log={head_log}"]) - - def _build_ray_exe_args(self, ray_password, ray_args, ray_port, interface): - - # python script that launches ray node - ray_starter_args = [ - "-m", - "smartsim._core.entrypoints.ray", - f"+ray-exe={find_ray_exe()}", - f"+port={ray_port}", - f"+ifname={interface}", - ] - if ray_password: - ray_starter_args += [f"+redis-password={ray_password}"] - - used = [ - "block", - "redis-password", - "start", - "head", - "port", - "dashboard-port", - "dashboard-host", - ] - extra_ray_args = [] - for key, value in ray_args.items(): - if key not in used: - extra_ray_args += [f"+ray-args=--{key}={value}"] - ray_starter_args += extra_ray_args - - return " ".join(ray_starter_args) diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 3ac4f5422..63a52b760 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -788,10 +788,6 @@ def _launch_summary(self, manifest): summary += f"Experiment: {self.name}\n" summary += f"Experiment Path: {self.exp_path}\n" summary += f"Launcher: {self._launcher}\n" - if manifest.ensembles or manifest.ray_clusters: - summary += ( - f"Ensembles: {len(manifest.ensembles) + len(manifest.ray_clusters)}\n" - ) if manifest.models: summary += f"Models: {len(manifest.models)}\n" diff --git a/tests/full_wlm/with_ray/test_ray_batch.py b/tests/full_wlm/with_ray/test_ray_batch.py deleted file mode 100644 index fe8bbaf24..000000000 --- a/tests/full_wlm/with_ray/test_ray_batch.py +++ /dev/null @@ -1,111 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import logging -import os.path as osp -import sys -import time -from os import environ - -import pytest - -from smartsim import Experiment -from smartsim._core.launcher import slurm -from smartsim.exp.ray import RayCluster - -"""Test Ray cluster batch launch and shutdown. -""" - -# retrieved from pytest fixtures -if pytest.test_launcher not in pytest.wlm_options: - pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") - -environ["OMP_NUM_THREADS"] = "1" -shouldrun = True -try: - import ray -except ImportError: - shouldrun = False - - -pytestmark = pytest.mark.skipif( - not shouldrun, - reason="requires Ray", -) - - -def test_ray_launch_and_shutdown_batch(fileutils, wlmutils, caplog): - launcher = wlmutils.get_test_launcher() - if launcher == "local": - pytest.skip("Test cannot be run with local launcher") - - caplog.set_level(logging.CRITICAL) - test_dir = fileutils.make_test_dir() - - exp = Experiment("ray-cluster", test_dir, launcher=launcher) - cluster = RayCluster( - name="ray-cluster", - run_args={}, - ray_args={"num-cpus": 4}, - launcher=launcher, - num_nodes=2, - batch=True, - interface=wlmutils.get_test_interface(), - batch_args={"A": wlmutils.get_test_account(), "queue": "debug-flat-quad"} - if launcher == "cobalt" - else None, - time="00:05:00", - ) - - exp.generate(cluster) - - try: - exp.start(cluster, block=False, summary=True) - ctx = ray.init("ray://" + cluster.get_head_address() + ":10001") - - right_resources = False - trials = 10 - while not right_resources and trials > 0: - right_resources = (len(ray.nodes()), ray.cluster_resources()["CPU"]) == ( - 2, - 8, - ) - trials -= 1 - time.sleep(1) - - if not right_resources: - ctx.disconnect() - ray.shutdown() - exp.stop(cluster) - assert False - - ctx.disconnect() - ray.shutdown() - exp.stop(cluster) - except: - # Catch all errors, most of which can come from Ray - exp.stop(cluster) - assert False diff --git a/tests/test_manifest.py b/tests/test_manifest.py index cb5fcacfa..7a856aea7 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -33,22 +33,13 @@ from smartsim._core.control import Manifest from smartsim.database import Orchestrator from smartsim.error import SmartSimError -from smartsim.exp.ray import RayCluster from smartsim.settings import RunSettings -# Ensure tensorflow is imported before ray. This is a workaround -# for a seg fault happening in the CI on Ubuntu when ray was being -# imported before tensorflow try: import tensorflow except ImportError: pass -ray_ok = True -try: - import ray -except ImportError: - ray_ok = False # ---- create entities for testing -------- @@ -64,23 +55,15 @@ orc_1 = deepcopy(orc) orc_1.name = "orc2" model_no_name = exp.create_model(name=None, run_settings=rs) -if ray_ok: - rc = RayCluster(name="ray-cluster", workers=0, launcher="slurm", run_command="srun") def test_separate(): - if ray_ok: - manifest = Manifest(model, ensemble, orc, rc) - else: - manifest = Manifest(model, ensemble, orc) + manifest = Manifest(model, ensemble, orc) assert manifest.models[0] == model assert len(manifest.models) == 1 assert manifest.ensembles[0] == ensemble assert len(manifest.ensembles) == 1 assert manifest.db == orc - if ray_ok: - assert len(manifest.ray_clusters) == 1 - assert manifest.ray_clusters[0] == rc def test_no_name(): diff --git a/tests/with_ray/test_ray.py b/tests/with_ray/test_ray.py deleted file mode 100644 index cd93b4618..000000000 --- a/tests/with_ray/test_ray.py +++ /dev/null @@ -1,233 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import logging -import time -from os import environ - -import psutil -import pytest - -from smartsim import Experiment -from smartsim.error import SSUnsupportedError -from smartsim.exp.ray import RayCluster -from smartsim.wlm import slurm - -"""Test Ray cluster launch and shutdown. -""" - -# retrieved from pytest fixtures -if pytest.test_launcher not in pytest.wlm_options: - pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") - - -environ["OMP_NUM_THREADS"] = "1" -shouldrun = True -try: - import ray -except ImportError: - shouldrun = False - - -pytestmark = pytest.mark.skipif( - not shouldrun, - reason="requires Ray", -) - - -def test_ray_launch_and_shutdown_wlm(fileutils, wlmutils, caplog): - launcher = wlmutils.get_test_launcher() - if launcher == "local": - pytest.skip("Test can not be run on local launcher") - - caplog.set_level(logging.CRITICAL) - test_dir = fileutils.make_test_dir() - - exp = Experiment("ray-cluster", test_dir, launcher=launcher) - cluster = RayCluster( - name="ray-cluster", - run_args={}, - ray_args={"num-cpus": 4}, - launcher=launcher, - num_nodes=2, - alloc=None, - batch=False, - time="00:05:00", - interface=wlmutils.get_test_interface(), - ) - - exp.generate(cluster) - exp.start(cluster, block=False, summary=False) - ctx = ray.init("ray://" + cluster.get_head_address() + ":10001") - - right_resources = False - trials = 10 - while not right_resources and trials > 0: - right_resources = (len(ray.nodes()), ray.cluster_resources()["CPU"]) == (2, 8) - trials -= 1 - time.sleep(1) - - if not right_resources: - ctx.disconnect() - ray.shutdown() - exp.stop(cluster) - assert False - - ctx.disconnect() - ray.shutdown() - exp.stop(cluster) - - -def test_ray_launch_and_shutdown_in_alloc(fileutils, wlmutils, caplog): - launcher = wlmutils.get_test_launcher() - if launcher != "slurm": - pytest.skip("Test only runs on systems with Slurm as WLM") - if "SLURM_JOBID" in environ: - pytest.skip("Test can not be run inside an allocation") - - caplog.set_level(logging.CRITICAL) - test_dir = fileutils.make_test_dir() - - alloc = slurm.get_allocation(4, time="00:05:00") - - exp = Experiment("ray-cluster", test_dir, launcher=launcher) - cluster = RayCluster( - name="ray-cluster", - run_args={}, - ray_args={"num-cpus": 4, "dashboard-port": "8266"}, - launcher=launcher, - workers=2, - alloc=alloc, - batch=False, - interface=wlmutils.get_test_interface(), - ) - - exp.generate(cluster) - exp.start(cluster, block=False, summary=False) - ctx = ray.init("ray://" + cluster.get_head_address() + ":10001") - - right_resources = False - trials = 10 - while not right_resources and trials > 0: - right_resources = (len(ray.nodes()), ray.cluster_resources()["CPU"]) == (3, 12) - trials -= 1 - time.sleep(1) - - assert cluster.get_dashboard_address() == cluster.get_head_address() + ":8266" - - if not right_resources: - ctx.disconnect() - ray.shutdown() - exp.stop(cluster) - slurm.release_allocation(alloc) - assert False - - ctx.disconnect() - ray.shutdown() - exp.stop(cluster) - slurm.release_allocation(alloc) - - -def test_ray_errors(fileutils): - """Try to start a local Ray cluster with incorrect settings.""" - - test_dir = fileutils.make_test_dir() - - with pytest.raises(SSUnsupportedError): - _ = RayCluster( - name="local-ray-cluster", - path=test_dir, - run_args={}, - launcher="local", - num_nodes=1, - ) - - with pytest.raises(ValueError): - _ = RayCluster( - name="small-ray-cluster", - path=test_dir, - run_args={}, - launcher="slurm", - num_nodes=0, - ) - - -@pytest.mark.skip(reason="Local launch is currently disabled for Ray") -def test_ray_local_launch_and_shutdown(fileutils, caplog): - """Start a local (single node) Ray cluster and - shut it down. - """ - # Avoid Ray output - caplog.set_level(logging.CRITICAL) - - test_dir = fileutils.make_test_dir() - - exp = Experiment("ray-cluster", launcher="local", exp_path=test_dir) - cluster = RayCluster( - name="ray-cluster", - run_args={}, - launcher="local", - ray_port=6830, - num_nodes=1, - batch=True, - ray_args={"num-cpus": "4", "dashboard-port": "8266"}, - ) - exp.generate(cluster, overwrite=False) - exp.start(cluster, block=False, summary=False) - - ray.init("ray://" + cluster.get_head_address() + ":10001") - - right_size = len(ray.nodes()) == 1 - if not right_size: - ray.shutdown() - exp.stop(cluster) - assert False - - right_resources = ray.cluster_resources()["CPU"] == 4 - if not right_resources: - ray.shutdown() - exp.stop(cluster) - assert False - - # Even setting batch to True must result in cluster.batch==False on local - if cluster.batch: - ray.shutdown() - exp.stop(cluster) - assert False - - ray.shutdown() - exp.stop(cluster) - - raylet_active = False - for proc in psutil.process_iter(): - try: - if "raylet" in proc.name().lower(): - raylet_active = True - except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): - pass - assert not raylet_active - - assert cluster.get_dashboard_address() == cluster.get_head_address() + ":8266" diff --git a/tutorials/ray/starting_ray.ipynb b/tutorials/ray/starting_ray.ipynb deleted file mode 100644 index 5f289e118..000000000 --- a/tutorials/ray/starting_ray.ipynb +++ /dev/null @@ -1,289 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "4cba3240", - "metadata": {}, - "source": [ - "# Ray Integration" - ] - }, - { - "cell_type": "markdown", - "id": "624cb31c", - "metadata": {}, - "source": [ - "## Starting a Ray Cluster with SmartSim\n", - "\n", - "Before we can begin starting up a Cluster, we first import the relevant modules. We will also define some global variables for clarity and ease of use:\n", - "\n", - " 1. `NUM_NODES` is the number of Ray nodes we will deploy with the first one will be the head node. We will run one node on each host.\n", - " 2. `CPUS_PER_WORKER` is number of cpus to be used by each worker in the cluster\n", - " 3. `LAUNCHER` is the workload manager that our SmartSim experiment and ray cluster will use" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "bf6b043d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os\n", - "import ray\n", - "from ray import tune\n", - "import ray.util\n", - "\n", - "from smartsim import Experiment\n", - "from smartsim.exp.ray import RayCluster\n", - "\n", - "NUM_NODES = 3\n", - "CPUS_PER_WORKER = 18\n", - "LAUNCHER='slurm'" - ] - }, - { - "cell_type": "markdown", - "id": "713f5f27", - "metadata": {}, - "source": [ - "Now, we instance a SmartSim experiment with the name `\"ray-cluster\"`, which we will spin up the Ray cluster. By doing so we will create a `ray-cluster` directory (relative to the path from where we are executing this notebook). The output files generated by the experment will be located in the `ray-cluster` directory. \n", - "\n", - "Next, we will instance a `RayCluster` to connect to the cluster. We are limiting the number each ray node can use to `CPUS_PER_WORKER`. If we wanted to let it use all the CPUs, it would suffice not to pass `ray_args`.\n", - "Notice that the cluster will be password-protected (the password, generated internally, will be shared with worker nodes).\n", - "\n", - "If the hosts are attached to multiple interfaces (e.g. `ib`, `eth0`, ...), we can specify to which one the Ray nodes should bind by setting the `interface` argument; it is recommended to always choose the one offering the best performances. On a Cray XC, for example, this will be `ipogif0`. \n", - "\n", - "Note that this approach only works with `ray>=1.6`. For previous versions, you have to add `password=None` to the `RayCluster` constructor." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a8851bff", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "exp = Experiment(\"ray-cluster\", launcher=LAUNCHER)\n", - "cluster = RayCluster(\n", - " name=\"ray-cluster\",\n", - " run_args={},\n", - " ray_args={\"num-cpus\": CPUS_PER_WORKER},\n", - " launcher=LAUNCHER,\n", - " num_nodes=NUM_NODES,\n", - " batch=False,\n", - " interface=\"ipogif0\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "a28512f9", - "metadata": {}, - "source": [ - "We now generate the needed directories. If an experiment with the same name already exists, this call will fail to avoid overwriting existing results. If we want to overwrite, we can simply pass `overwrite=True` to `exp.generate()`." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "30c66187", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "exp.generate(cluster, overwrite=True)" - ] - }, - { - "cell_type": "markdown", - "id": "5ddd1af8", - "metadata": {}, - "source": [ - "Now we are ready to start the cluster!" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "088251d3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "exp.start(cluster, block=False, summary=False)" - ] - }, - { - "cell_type": "markdown", - "id": "847a4a74", - "metadata": {}, - "source": [ - "## Connect to the Ray Cluster\n", - "\n", - "Now we can just connect to our running server." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2a90ff89", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ctx = ray.init(f\"ray://{cluster.get_head_address()}:10001\")" - ] - }, - { - "cell_type": "markdown", - "id": "c6401082", - "metadata": {}, - "source": [ - "We can check that all resources are set properly." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c17e5555", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\n", - " (\n", - " \"This cluster consists of\\n\"\n", - " f\"{len(ray.nodes())} nodes in total\\n\"\n", - " f\"{ray.cluster_resources()['CPU']} CPU resources in total\\n\"\n", - " f\"and the head node is running at {cluster.get_head_address()}\"\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "4f6663d4", - "metadata": {}, - "source": [ - "We can run a Ray Tune example, to see that everything is working." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1f08fc6a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "tune.run(\n", - " \"PPO\",\n", - " stop={\"episode_reward_max\": 200},\n", - " config={\n", - " \"framework\": \"torch\",\n", - " \"env\": \"CartPole-v0\",\n", - " \"num_gpus\": 0,\n", - " \"lr\": tune.grid_search(np.linspace (0.001, 0.01, 50).tolist()),\n", - " \"log_level\": \"ERROR\",\n", - " },\n", - " local_dir=os.path.join(exp.exp_path, \"ray_log\"),\n", - " verbose=0,\n", - " fail_fast=True,\n", - " log_to_file=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "66e52249", - "metadata": {}, - "source": [ - "When the Ray job is running, we can connect to the Ray dashboard to monitor the evolution of the experiment. To do this, if Ray is running on a compute node of a remote system, we need to setup a SSH tunnel (we will see later how), to forward the port on which the dashboard is published to our local system. For example, if the head address (printed in the cell above) is ``, and the system name is ``, we can establish a tunnel to the dashboard opening a terminal on the local system and entering:\n", - "\n", - "```bash\n", - "ssh -L 8265::8265 \n", - "```\n", - "\n", - "Then, from a browser on the local system, we can go to the address `http://localhost:8265` to see the dashboard.\n", - "\n", - "There are two things to know if something does not work:\n", - "\n", - "1. We are using `8265` as a port, which is the default dashboard port. If that port is not free, we can bind the dashboard to another port, e.g. `PORT_NUMBER` (by adding `\"dashboard-port\": str(PORT_NUMBER)` to `ray_args` when creating the cluster) and the command changed accordingly.\n", - "\n", - "2. If the port forwarding fails, it is possible that the interface is not reachable. In that case, you can add `\"dashboard-address\": \"0.0.0.0\"` to `ray_args` when creating the cluster, to bind the dashboard to all interfaces, or select a visible address if one knows it. You can then use the node name (or its public IP) to establish the tunnel, by entering (on the local terminal):\n", - " ```bash \n", - " ssh -L 8265::8265 \n", - " ```\n", - "Please refer to your system guide to find out how you can get the name and the address of a node." - ] - }, - { - "cell_type": "markdown", - "id": "6da5f0a5", - "metadata": {}, - "source": [ - "## Stop Cluster and Release Resources\n", - "\n", - "When we are finished with the cluster and ready to deallocate resources, we must first shut down the Ray runtime, followed by disconnecting the context." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4961f1d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ray.shutdown()\n", - "ctx.disconnect()" - ] - }, - { - "cell_type": "markdown", - "id": "97d167bb", - "metadata": {}, - "source": [ - "Now that all is gracefully stopped, we can stop the job on the allocation." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f19f7b95", - "metadata": {}, - "outputs": [], - "source": [ - "exp.stop(cluster)" - ] - } - ], - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 4b19f8a33108ab3e5378df96edaaa4a4d4e5b74b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 28 Feb 2023 20:12:54 +0100 Subject: [PATCH 2/2] Update changelog.rst Fix changelog order and links --- doc/changelog.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index b0d535146..b4c29f235 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -22,28 +22,29 @@ This section details changes made in the development branch that have not yet be Description +- Drop support for Ray +- Allow for models to be launched independently as batch jobs - Update to current version of Redis - Fix bug in colocated database entrypoint when loading PyTorch models - Add support for RedisAI 1.2.7, pyTorch 1.11.0, Tensorflow 2.8.0, ONNXRuntime 1.11.1 -- Allow for models to be launched independently as batch jobs -- Drop support for Ray Detailed Notes +- The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. + We plan to release a separate add-on library to accomplish the same results. If + you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! (PR263_) - Update from Redis version 6.0.8 to 7.0.5. (PR258_) -- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (PR237_) -- The release of RedisAI 1.2.7 allows us to update support for recent versions of pyTorch, Tensorflow, and ONNX (PR234_) -- Make installation of correct Torch backend more reliable according to instruction from pyTorch - Models were given a `batch_settings` attribute. When launching a model through `Experiment.start` the `Experiment` will first check for a non-nullish value at that attribute. If the check is satisfied, the `Experiment` will attempt to wrap the underlying run command in a batch job using the object referenced at `Model.batch_settings` as the batch settings for the job. If the check is not satisfied, the `Model` is launched in the traditional manner as a job step. (PR245_) -- The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. - We plan to release a separate add-on library to accomplish the same results. If - you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! +- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (PR237_) +- The release of RedisAI 1.2.7 allows us to update support for recent versions of pyTorch, Tensorflow, and ONNX (PR234_) +- Make installation of correct Torch backend more reliable according to instruction from pyTorch -.. _PR255: https://github.com/CrayLabs/SmartSim/pull/258 +.. _PR263: https://github.com/CrayLabs/SmartSim/pull/263 +.. _PR258: https://github.com/CrayLabs/SmartSim/pull/258 .. _PR245: https://github.com/CrayLabs/SmartSim/pull/245 .. _PR237: https://github.com/CrayLabs/SmartSim/pull/237 .. _PR234: https://github.com/CrayLabs/SmartSim/pull/234