From a6701c04168e2f6bc62bead07b5226e4f16544d3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 1 Apr 2022 16:14:40 -0500 Subject: [PATCH 01/15] First commit --- smartsim/_core/control/controller.py | 111 ++++++++++++++++++++- smartsim/_core/launcher/colocated.py | 8 +- smartsim/entity/__init__.py | 1 + smartsim/entity/dbobject.py | 143 +++++++++++++++++++++++++++ smartsim/entity/model.py | 99 ++++++++++++++++++- 5 files changed, 356 insertions(+), 6 deletions(-) create mode 100644 smartsim/entity/dbobject.py diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index e41d1f16b..a29a8fa8b 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -29,8 +29,9 @@ import threading import time +from ..._core._cli.utils import get_install_path from ...database import Orchestrator -from ...entity import DBNode, EntityList, SmartSimEntity +from ...entity import DBNode, DBModel, DBObject, DBScript, EntityList, SmartSimEntity from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError from ...log import get_logger from ...status import STATUS_RUNNING, TERMINAL_STATUSES @@ -39,6 +40,9 @@ from ..utils import check_cluster_status, create_cluster from .jobmanager import JobManager +from smartredis import Client + + logger = get_logger(__name__) # job manager lock @@ -75,6 +79,9 @@ def start(self, manifest, block=True): # start the job manager thread if not already started if not self._jobs.actively_monitoring: self._jobs.start() + + if self.orchestrator_active(): + self.set_dbobjects(manifest) except KeyboardInterrupt: self._jobs.signal_interrupt() @@ -82,7 +89,7 @@ def start(self, manifest, block=True): # block until all non-database jobs are complete if block: - # poll handles it's own keyboard interrupt as + # poll handles its own keyboard interrupt as # it may be called seperately self.poll(5, True) @@ -302,7 +309,7 @@ def _launch(self, manifest): batch_step = self._create_batch_job_step(elist) steps.append((batch_step, elist)) else: - # if ensemble is to be run as seperate job steps, aka not in a batch + # if ensemble is to be run as separate job steps, aka not in a batch job_steps = [(self._create_job_step(e), e) for e in elist.entities] steps.extend(job_steps) @@ -591,3 +598,101 @@ def reload_saved_db(self, checkpoint_file): return orc finally: JM_LOCK.release() + + + def _enumerate_devices(self, dbobject: DBObject): + """Enumerate devices for a DBObject + + :param dbobject: DBObject to enumerate + :type dbobject: DBObject + :return: list of device names + :rtype: list[str] + """ + devices = [] + if dbobject.device in ["CPU", "GPU"] and dbobject.devices_per_node > 1: + for device_num in dbobject.devices_per_node: + devices.append(f"{dbobject.device}:{str(device_num)}") + else: + devices = [dbobject.device] + + + def set_ml_model(self, db_model: DBModel, address, cluster=False): + devices = self._enumerate_devices(db_model) + + redis_cli = (get_install_path / "_core/bin/redis-cli").resolve() + if not redis_cli.is_file(): + raise FileNotFoundError("Could not find redis-cli") + + client = Client(address=address, cluster=cluster) + + for device in devices: + if db_model.is_file: + client.set_model_from_file( + key=db_model.name, + model=db_model.model, + backend=db_model.backend, + device=device, + batch_size=db_model.batch_size, + min_batch_size=db_model.min_batch_size, + tag=db_model.tag, + inputs=db_model.inputs, + outputs=db_model.outputs + ) + else: + client.set_model( + key=db_model.name, + 
model=db_model.model, + backend=db_model.backend, + device=device, + batch_size=db_model.batch_size, + min_batch_size=db_model.min_batch_size, + tag=db_model.tag, + inputs=db_model.inputs, + outputs=db_model.outputs + ) + # TODO some more error handlings here + + + def set_script(self, db_script:DBScript, address, cluster=False): + devices = self._enumerate_devices(db_script) + # TODO some error handling here + client = Client(address=address, cluster=cluster) + + for device in devices: + if db_script.is_file: + client.set_script_from_file( + key=db_script.name, + script_file=db_script.file, + device=device + ) + else: + client.set_script( + key=db_script.name, + script=db_script.script, + device=device + ) + # TODO some more error handlings here + + + def set_dbobjects(self, manifest): + db_addresses = self._jobs.get_db_host_addresses() + cluster = len(db_addresses) > 1 + address = db_addresses[0] + for model in manifest.models: + for db_model in model._db_models: + self.set_ml_model(db_model, address, cluster) + for db_script in model._db_scripts: + self.set_script(db_script, address, cluster) + + # This allows users to specify per-ensemble and + # per-entity models + for ensemble in manifest.ensembles + manifest.ray_clusters: + for db_model in ensemble._db_models: + self.set_ml_model(db_model, address, cluster) + for db_script in ensemble._db_scripts: + self.set_script(db_script, address, cluster) + for entity in ensemble: + for db_model in entity._db_models: + self.set_ml_model(db_model, address, cluster) + for db_script in entity._db_scripts: + self.set_script(db_script, address, cluster) \ No newline at end of file diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 990a27789..b7ba7019f 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -65,6 +65,10 @@ def write_colocated_launch_script(file_name, db_log, colocated_settings): f.write(f"{colocated_cmd}\n") f.write(f"DBPID=$!\n\n") + + if colocated_settings["db_models"]: + pass + if colocated_settings["limit_app_cpus"]: cpus = colocated_settings["cpus"] f.write( @@ -129,7 +133,7 @@ def _build_colocated_wrapper_cmd(port=6780, # add extra redisAI configurations for arg, value in rai_args.items(): if value: - # RAI wants arguments for inference in all capps + # RAI wants arguments for inference in all caps # ex. THREADS_PER_QUEUE=1 db_cmd.append(f"{arg.upper()} {str(value)}") @@ -142,7 +146,7 @@ def _build_colocated_wrapper_cmd(port=6780, ]) for db_arg, value in extra_db_args.items(): # replace "_" with "-" in the db_arg because we use kwargs - # for the extra configurations and Python doesn't allow a hypon + # for the extra configurations and Python doesn't allow a hyphen # in a variable name. All redis and KeyDB configuration options # use hyphens in their names. 
db_arg = db_arg.replace("_", "-") diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 0c8c54c48..ab661a7f1 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -3,3 +3,4 @@ from .entity import SmartSimEntity from .entityList import EntityList from .model import Model +from .dbobject import DBScript, DBModel, DBObject \ No newline at end of file diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py new file mode 100644 index 000000000..a493bad9a --- /dev/null +++ b/smartsim/entity/dbobject.py @@ -0,0 +1,143 @@ +from pathlib import Path +from .._core.utils.helpers import init_default + + +class DBObject: + def __init__(self, name, func, file_path, device, devices_per_node): + self.name = name + self.func = func + if file_path: + self.file = self._check_filepath(file_path) + self.device = self._check_device(device) + self.devices_per_node = devices_per_node + + def is_file(self): + if self.func: + return False + return True + + @staticmethod + def _check_tensor_args(inputs, outputs): + inputs = init_default([], inputs, (list, str)) + outputs = init_default([], outputs, (list, str)) + if isinstance(inputs, str): + inputs = [inputs] + if isinstance(outputs, str): + outputs = [outputs] + return inputs, outputs + + @staticmethod + def _check_backend(backend): + backend = backend.upper() + all_backends = ["TF", "TORCH", "ONNX"] + if backend in all_backends: + return backend + else: + raise ValueError( + f"Backend type {backend} unsupported. Options are {all_backends}") + + @staticmethod + def _check_filepath(file): + file_path = Path(file).resolve() + if not file_path.is_file(): + raise FileNotFoundError(file_path) + return file_path + + @staticmethod + def _check_device(device): + device = device.upper() + if not device.startswith("CPU") and not device.startswith("GPU"): + raise ValueError("Device argument must start with either CPU or GPU") + return device + +class DBScript(DBObject): + + def __init__(self, + name, + script=None, + script_path=None, + device="CPU", + devices_per_node=1 + ): + """TorchScript code represenation + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. 
+ + One of either script (in memory representation) or script_path (file) + must be provided + + :param name: key to store script under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code, defaults to None + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + """ + super().__init__(name, script, script_path, device, devices_per_node) + if not script and not script_path: + raise ValueError("Either script or script_path must be provided") + + @property + def script(self): + return self.func + +class DBModel(DBObject): + def __init__(self, + name, + backend, + model=None, + model_file=None, + device="CPU", + devices_per_node=1, + batch_size=0, + min_batch_size=0, + min_batch_timeout=0, + tag="", + inputs=None, + outputs=None): + """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + + One of either model (in memory representation) or model_path (file) + must be provided + + :param name: key to store model under + :type name: str + :param model: model in memory + :type model: str, optional + :param model_file: serialized model + :type model_file: file path to model, optional + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str + :param device: name of device for execution, defaults to "CPU" + :type device: str, optional + :param batch_size: batch size for execution, defaults to 0 + :type batch_size: int, optional + :param min_batch_size: minimum batch size for model execution, defaults to 0 + :type min_batch_size: int, optional + :param tag: additional tag for model information, defaults to "" + :type tag: str, optional + :param inputs: model inputs (TF only), defaults to None + :type inputs: list[str], optional + :param outputs: model outupts (TF only), defaults to None + :type outputs: list[str], optional + """ + super().__init__(name, model, model_file, device, devices_per_node) + self.backend = self._check_backend(backend) + if not model and not model_file: + raise ValueError("Either model or model_file must be provided") + self.batch_size = batch_size + self.min_batch_size = min_batch_size + self.min_batch_timeout = min_batch_timeout + self.tag = tag + self.inputs, self.outputs = self._check_tensor_args(inputs, outputs) + + @property + def model(self): + return self.func + diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 5b467559a..7d7cc62fa 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -29,6 +29,7 @@ from ..error import EntityExistsError, SSUnsupportedError from .entity import SmartSimEntity from .files import EntityFiles +from .dbobject import DBScript, DBModel class Model(SmartSimEntity): @@ -54,6 +55,8 @@ def __init__(self, name, params, path, run_settings, params_as_args=None): self.params_as_args = params_as_args self.incoming_entities = [] self._key_prefixing_enabled = False + self._db_models = [] + self._db_scripts = [] self.files = None @property @@ -197,7 +200,6 @@ def colocate_db(self, ]) self.run_settings.colocated_db_settings = colo_db_config - def params_to_args(self): """Convert parameters to command line arguments and update run settings.""" for param in self.params_as_args: @@ -213,6 +215,99 @@ def params_to_args(self): ) self.run_settings.add_exe_args(cat_arg_and_value(param, self.params[param])) + def add_ml_model(self, + name, + backend, + model=None, + model_path=None, + device="CPU", + devices_per_node=1, + batch_size=0, + 
min_batch_size=0, + tag="", + inputs=None, + outputs=None): + """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + + Each ML Model added will be loaded into an + orchestrator (converged or not) prior to the execution + of this Model instance + + One of either model (in memory representation) or model_path (file) + must be provided + + :param name: key to store model under + :type name: str + :param model: model in memory + :type model: str, optional # TODO figure out what to type hint this as + :param model_path: serialized model + :type model_path: file path to model + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str + :param device: name of device for execution, defaults to "CPU" + :type device: str, optional + :param batch_size: batch size for execution, defaults to 0 + :type batch_size: int, optional + :param min_batch_size: minimum batch size for model execution, defaults to 0 + :type min_batch_size: int, optional + :param tag: additional tag for model information, defaults to "" + :type tag: str, optional + :param inputs: model inputs (TF only), defaults to None + :type inputs: list[str], optional + :param outputs: model outupts (TF only), defaults to None + :type outputs: list[str], optional + """ + db_model = DBModel( + name, + backend, + model, + model_path, + device, + devices_per_node, + batch_size, + min_batch_size, + tag, + inputs, + outputs + ) + self._db_models.append(db_model) + + def add_script(self, name, script=None, script_path=None, device="CPU", devices_per_node=1): + """TorchScript to launch with this Model instance + + Each script added to the model will be loaded into an + orchestrator (converged or not) prior to the execution + of this Model instance + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. 
+ + One of either script (in memory representation) or script_path (file) + must be provided + + :param name: key to store script under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + :param devices_per_node: number of devices on each host + :type devices_per_node: int + """ + db_script = DBScript( + name, + script, + script_path, + device, + devices_per_node + ) + self._db_scripts.append(db_script) + def __eq__(self, other): if self.name == other.name: return True @@ -223,3 +318,5 @@ def __str__(self): # pragma: no cover entity_str += "Type: " + self.type + "\n" entity_str += str(self.run_settings) return entity_str + + From ca9c9db62286a99846ff162e7bd48368871db3da Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 5 Apr 2022 11:53:26 -0500 Subject: [PATCH 02/15] Working add_script, add_function, add_model --- smartsim/_core/control/controller.py | 141 +++++++------- smartsim/_core/entrypoints/colocated.py | 124 ++++++++++++- smartsim/_core/launcher/colocated.py | 74 +++++++- smartsim/entity/dbobject.py | 39 ++++ smartsim/entity/model.py | 79 ++++++-- smartsim/ml/tf/utils.py | 2 + tests/backends/test_dbmodel.py | 172 ++++++++++++++++++ tests/backends/test_dbscript.py | 136 ++++++++++++++ tests/test_configs/run_dbmodel_smartredis.py | 26 +++ tests/test_configs/run_dbscript_smartredis.py | 31 ++++ tests/test_configs/torchscript.py | 4 + 11 files changed, 741 insertions(+), 87 deletions(-) create mode 100644 tests/backends/test_dbmodel.py create mode 100644 tests/backends/test_dbscript.py create mode 100644 tests/test_configs/run_dbmodel_smartredis.py create mode 100644 tests/test_configs/run_dbscript_smartredis.py create mode 100644 tests/test_configs/torchscript.py diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index a29a8fa8b..505027186 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -41,6 +41,7 @@ from .jobmanager import JobManager from smartredis import Client +from smartredis.error import RedisConnectionError, RedisReplyError logger = get_logger(__name__) @@ -80,8 +81,8 @@ def start(self, manifest, block=True): if not self._jobs.actively_monitoring: self._jobs.start() - if self.orchestrator_active(): - self.set_dbobjects(manifest) + if self.orchestrator_active: + self._set_dbobjects(manifest) except KeyboardInterrupt: self._jobs.signal_interrupt() @@ -610,89 +611,101 @@ def _enumerate_devices(self, dbobject: DBObject): """ devices = [] if dbobject.device in ["CPU", "GPU"] and dbobject.devices_per_node > 1: - for device_num in dbobject.devices_per_node: + for device_num in range(dbobject.devices_per_node): devices.append(f"{dbobject.device}:{str(device_num)}") else: devices = [dbobject.device] + return devices - def set_ml_model(self, db_model: DBModel, address, cluster=False): - devices = self._enumerate_devices(db_model) - - redis_cli = (get_install_path / "_core/bin/redis-cli").resolve() - if not redis_cli.is_file(): - raise FileNotFoundError("Could not find redis-cli") - client = Client(address=address, cluster=cluster) + def _set_ml_model(self, db_model, address, cluster=False): + devices = self._enumerate_devices(db_model) + try: + client = Client(address=address, cluster=cluster) + except RedisConnectionError as error: + logger.error("Could not connect to orchestrator") + raise 
error for device in devices: - if db_model.is_file: - client.set_model_from_file( - key=db_model.name, - model=db_model.model, - backend=db_model.backend, - device=device, - batch_size=db_model.batch_size, - min_batch_size=db_model.min_batch_size, - tag=db_model.tag, - inputs=db_model.inputs, - outputs=db_model.outputs - ) - else: - client.set_model( - key=db_model.name, - model=db_model.model, - backend=db_model.backend, - device=device, - batch_size=db_model.batch_size, - min_batch_size=db_model.min_batch_size, - tag=db_model.tag, - inputs=db_model.inputs, - outputs=db_model.outputs - ) - # TODO some more error handlings here + try: + if db_model.is_file: + client.set_model_from_file( + name=db_model.name, + model_file=str(db_model.file), + backend=db_model.backend, + device=device, + batch_size=db_model.batch_size, + min_batch_size=db_model.min_batch_size, + tag=db_model.tag, + inputs=db_model.inputs, + outputs=db_model.outputs + ) + else: + client.set_model( + name=db_model.name, + model=db_model.model, + backend=db_model.backend, + device=device, + batch_size=db_model.batch_size, + min_batch_size=db_model.min_batch_size, + tag=db_model.tag, + inputs=db_model.inputs, + outputs=db_model.outputs + ) + except RedisReplyError as error: + logger.error("Error while setting model on orchestrator.") + raise error - def set_script(self, db_script:DBScript, address, cluster=False): + def _set_script(self, db_script, address, cluster=False): devices = self._enumerate_devices(db_script) - # TODO some error handling here - client = Client(address=address, cluster=cluster) + try: + client = Client(address=address, cluster=cluster) + except RedisConnectionError as error: + logger.error("Could not connect to orchestrator") + raise error for device in devices: - if db_script.is_file: - client.set_script_from_file( - key=db_script.name, - script_file=db_script.file, - device=device - ) - else: - client.set_script( - key=db_script.name, - script=db_script.script, - device=device - ) - # TODO some more error handlings here + try: + if db_script.is_file: + client.set_script_from_file( + name=db_script.name, + file=str(db_script.file), + device=device + ) + else: + if isinstance(db_script.script, str): + client.set_script( + name=db_script.name, + script=db_script.script, + device=device + ) + else: + client.set_function( + name=db_script.name, + function=db_script.script, + device=device + ) + + except RedisReplyError as error: + logger.error("Error while setting model on orchestrator.") + raise error - def set_dbobjects(self, manifest): + def _set_dbobjects(self, manifest): db_addresses = self._jobs.get_db_host_addresses() cluster = len(db_addresses) > 1 address = db_addresses[0] for model in manifest.models: for db_model in model._db_models: - self.set_ml_model(db_model, address, cluster) + self._set_ml_model(db_model, address, cluster) for db_script in model._db_scripts: - self.set_script(db_script, address, cluster) - - # This allows users to specify per-ensemble and - # per-entity models - for ensemble in manifest.ensembles + manifest.ray_clusters: - for db_model in ensemble._db_models: - self.set_ml_model(db_model, address, cluster) - for db_script in ensemble._db_scripts: - self.set_script(db_script, address, cluster) + self._set_script(db_script, address, cluster) + + for ensemble in manifest.ensembles: for entity in ensemble: for db_model in entity._db_models: - self.set_ml_model(db_model, address, cluster) + self._set_ml_model(db_model, address, cluster) for db_script in entity._db_scripts: - 
self.set_script(db_script, address, cluster) \ No newline at end of file + self._set_script(db_script, address, cluster) \ No newline at end of file diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index eaebeb3d7..484050ce3 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -37,6 +37,8 @@ from pathlib import Path from subprocess import PIPE, STDOUT +from smartredis import Client +from smartredis.error import RedisConnectionError from smartsim._core.utils.network import current_ip from smartsim.error import SSInternalError from smartsim.log import get_logger @@ -55,8 +57,107 @@ def handle_signal(signo, frame): cleanup() +def launch_db_model(client: Client, db_model: List[str]): + """Parse options to launch model on local cluster -def main(network_interface: str, db_cpus: int, command: List[str]): + :param client: SmartRedis client connected to local DB + :type client: Client + :param db_model: List of arguments defining the model + :type db_model: List[str] + :return: Name of model + :rtype: str + """ + parser = argparse.ArgumentParser("Set ML model on DB") + parser.add_argument("--name", type=str) + parser.add_argument("--file", type=str) + parser.add_argument("--backend", type=str) + parser.add_argument("--device", type=str) + parser.add_argument("--devices_per_node", type=int) + parser.add_argument("--batch_size", type=int, default=0) + parser.add_argument("--min_batch_size", type=int, default=0) + parser.add_argument("--tag", type=str, default="") + parser.add_argument("--inputs", nargs="+", default=None) + parser.add_argument("--outputs", nargs="+", default=None) + + # Unused if we use SmartRedis + parser.add_argument("--min_batch_timeout", type=int, default=None) + args = parser.parse_args(db_model) + + if args.inputs: + inputs = list(args.inputs) + if args.outputs: + outputs = list(args.outputs) + + if args.devices_per_node == 1: + client.set_model_from_file(args.name, + args.file, + args.backend, + args.device, + args.batch_size, + args.min_batch_size, + args.tag, + inputs, + outputs) + else: + for device_num in range(args.devices_per_node): + client.set_model_from_file(args.name, + args.file, + args.backend, + args.device+f":{device_num}", + args.batch_size, + args.min_batch_size, + args.tag, + inputs, + outputs) + + return args.name + +def launch_db_script(client: Client, db_script: List[str]): + """Parse options to launch script on local cluster + + :param client: SmartRedis client connected to local DB + :type client: Client + :param db_model: List of arguments defining the script + :type db_model: List[str] + :return: Name of model + :rtype: str + """ + parser = argparse.ArgumentParser("Set script on DB") + parser.add_argument("--name", type=str) + parser.add_argument("--func", type=str) + parser.add_argument("--file", type=str) + parser.add_argument("--backend", type=str) + parser.add_argument("--device", type=str) + parser.add_argument("--devices_per_node", type=int) + args = parser.parse_args(db_script) + if args.func: + func = args.func.replace("\\n", "\n") + + if args.devices_per_node == 1: + client.set_script(args.name, + func, + args.device) + else: + for device_num in range(args.devices_per_node): + client.set_script(args.name, + func, + args.device+f":{device_num}") + elif args.file: + if args.devices_per_node == 1: + client.set_script_from_file(args.name, + args.file, + args.device) + else: + for device_num in range(args.devices_per_node): + 
client.set_script_from_file(args.name, + args.file, + args.device+f":{device_num}") + + + return args.name + + +def main(network_interface: str, db_cpus: int, command: List[str], db_models: List[List[str]], db_scripts: List[List[str]]): global DBPID try: @@ -102,6 +203,23 @@ def main(network_interface: str, db_cpus: int, command: List[str]): f"\tCommand: {' '.join(cmd)}\n\n" ))) + if db_models or db_scripts: + try: + client = Client(cluster=False) + for i, db_model in enumerate(db_models): + logger.debug("Uploading model") + model_name = launch_db_model(client, db_model) + logger.debug(f"Added model {model_name} ({i+1}/{len(db_models)})") + for i, db_script in enumerate(db_scripts): + logger.debug("Uploading script") + script_name = launch_db_script(client, db_script) + logger.debug(f"Added script {script_name} ({i+1}/{len(db_scripts)})") + # Make sure we don't keep this around + del client + except RedisConnectionError: + raise SSInternalError("Failed to set model or script, could not connect to database") + + for line in iter(p.stdout.readline, b""): print(line.decode("utf-8").rstrip(), flush=True) @@ -144,6 +262,8 @@ def cleanup(): parser.add_argument("+lockfile", type=str, help="Filename to create for single proc per host") parser.add_argument("+db_cpus", type=int, default=2, help="Number of CPUs to use for DB") parser.add_argument("+command", nargs="+", help="Command to run") + parser.add_argument("+db_model", nargs="+", action="append", default=[], help="Model to set on DB") + parser.add_argument("+db_script", nargs="+", action="append", default=[], help="Script to set on DB") args = parser.parse_args() tmp_lockfile = Path(tempfile.gettempdir()) / args.lockfile @@ -160,7 +280,7 @@ def cleanup(): for sig in SIGNALS: signal.signal(sig, handle_signal) - main(args.ifname, args.db_cpus, args.command) + main(args.ifname, args.db_cpus, args.command, args.db_model, args.db_script) # gracefully exit the processes in the distributed application that # we do not want to have start a colocated process. Only one process diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index b7ba7019f..3556ce5a9 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -25,7 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import sys + from ..config import CONFIG +from ...error import SSUnsupportedError from ..utils.helpers import create_lockfile_name @@ -66,9 +68,6 @@ def write_colocated_launch_script(file_name, db_log, colocated_settings): f.write(f"{colocated_cmd}\n") f.write(f"DBPID=$!\n\n") - if colocated_settings["db_models"]: - pass - if colocated_settings["limit_app_cpus"]: cpus = colocated_settings["cpus"] f.write( @@ -154,9 +153,78 @@ def _build_colocated_wrapper_cmd(port=6780, f"--{db_arg}", value ]) + + db_models = kwargs.get("db_models", None) + if db_models: + db_model_cmd = _build_db_model_cmd(db_models) + db_cmd.extend(db_model_cmd) + + db_scripts = kwargs.get("db_scripts", None) + if db_scripts: + db_script_cmd = _build_db_script_cmd(db_scripts) + db_cmd.extend(db_script_cmd) + # run colocated db in the background db_cmd.append("&") cmd.extend(db_cmd) return " ".join(cmd) + +def _build_db_model_cmd(db_models): + cmd = [] + for db_model in db_models: + cmd.append("+db_model") + cmd.append(f"--name={db_model.name}") + if db_model.file: + cmd.append(f"--file={db_model.file}") + else: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += "Please store the ML model in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + cmd.append(f"--backend={db_model.backend}") + cmd.append(f"--device={db_model.device}") + cmd.append(f"--devices_per_node={db_model.devices_per_node}") + if db_model.batch_size: + cmd.append(f"--batch_size={db_model.batch_size}") + if db_model.min_batch_size: + cmd.append(f"--min_batch_size={db_model.min_batch_size}") + if db_model.min_batch_timeout: + cmd.append(f"--min_batch_timeout={db_model.min_batch_timeout}") + if db_model.tag: + cmd.append(f"--tag={db_model.tag}") + if db_model.inputs: + cmd.append("--inputs="+",".join(db_model.inputs)) + if db_model.outputs: + cmd.append("--outputs="+",".join(db_model.outputs)) + + return cmd + + + + +def _build_db_script_cmd(db_scripts): + cmd = [] + for db_script in db_scripts: + cmd.append("+db_script") + cmd.append(f"--name={db_script.name}") + if db_script.func: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += "Please convert the function to a string or store it as a text file " + err_msg += "and add it to the SmartSim Model with add_script." 
+ raise SSUnsupportedError(err_msg) + + sanitized_func = db_script.func.replace("\n", "\\n") + if not (sanitized_func.startswith("'") and sanitized_func.endswith("'") + or (sanitized_func.startswith('"') and sanitized_func.endswith('"'))): + sanitized_func = "\"" + sanitized_func + "\"" + cmd.append(f"--func={sanitized_func}") + elif db_script.file: + cmd.append(f"--file={db_script.file}") + cmd.append(f"--device={db_script.device}") + cmd.append(f"--devices_per_node={db_script.devices_per_node}") + + return cmd + \ No newline at end of file diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index a493bad9a..3c40aa6ea 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -8,9 +8,13 @@ def __init__(self, name, func, file_path, device, devices_per_node): self.func = func if file_path: self.file = self._check_filepath(file_path) + else: + # Need to have this explicitly to check on it + self.file = None self.device = self._check_device(device) self.devices_per_node = devices_per_node + @property def is_file(self): if self.func: return False @@ -87,6 +91,17 @@ def __init__(self, def script(self): return self.func + def __str__(self): + desc_str = "Name: " + self.name + "\n" + if self.func: + desc_str += "Func: " + self.func + "\n" + if self.file: + desc_str += "File path: " + str(self.file) + "\n" + devices_str = self.device + ("s per node\n" if self.devices_per_node > 1 else " per node\n") + desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str + return desc_str + + class DBModel(DBObject): def __init__(self, name, @@ -120,6 +135,8 @@ def __init__(self, :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 :type min_batch_size: int, optional + :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 + :type min_batch_timeout: int, optional :param tag: additional tag for model information, defaults to "" :type tag: str, optional :param inputs: model inputs (TF only), defaults to None @@ -141,3 +158,25 @@ def __init__(self, def model(self): return self.func + def __str__(self): + desc_str = "Name: " + self.name + "\n" + if self.model: + desc_str += "Model stored in memory" + if self.file: + desc_str += "File path: " + str(self.file) + "\n" + devices_str = self.device + ("s per node\n" if self.devices_per_node > 1 else " per node\n") + desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str + desc_str += "Backend: " + str(self.backend) + "\n" + if self.batch_size: + desc_str += "Batch size: " + str(self.batch_size) + "\n" + if self.min_batch_size: + desc_str += "Min batch size: " + str(self.min_batch_size) + "\n" + if self.min_batch_timeout: + desc_str += "Min batch time out: " + str(self.min_batch_timeout) + "\n" + if self.tag: + desc_str += "Tag: " + self.tag + "\n" + if self.inputs: + desc_str += "Inputs: " + str(self.inputs) + "\n" + if self.outputs: + desc_str += "Outputs: " + str(self.outputs) + "\n" + return desc_str diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 7d7cc62fa..8c52d3528 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -198,6 +198,10 @@ def colocate_db(self, colo_db_config["extra_db_args"] = dict([ (k,str(v)) for k,v in kwargs.items() if k not in colo_db_config["rai_args"] ]) + + colo_db_config["db_models"] = self._db_models + colo_db_config["db_scripts"] = self._db_scripts + self.run_settings.colocated_db_settings = colo_db_config def params_to_args(self): @@ -258,17 +262,17 @@ 
def add_ml_model(self, :type outputs: list[str], optional """ db_model = DBModel( - name, - backend, - model, - model_path, - device, - devices_per_node, - batch_size, - min_batch_size, - tag, - inputs, - outputs + name=name, + backend=backend, + model=model, + model_file=model_path, + device=device, + devices_per_node=devices_per_node, + batch_size=batch_size, + min_batch_size=min_batch_size, + tag=tag, + inputs=inputs, + outputs=outputs ) self._db_models.append(db_model) @@ -285,7 +289,7 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ Setting ``devices_per_node=N``, with N greater than one will result in the model being stored in the first N devices of type ``device``. - One of either script (in memory representation) or script_path (file) + One of either script (in memory string representation) or script_path (file) must be provided :param name: key to store script under @@ -300,11 +304,46 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ :type devices_per_node: int """ db_script = DBScript( - name, - script, - script_path, - device, - devices_per_node + name=name, + script=script, + script_path=script_path, + device=device, + devices_per_node=devices_per_node + ) + self._db_scripts.append(db_script) + + + def add_function(self, name, function=None, device="CPU", devices_per_node=1): + """TorchScript function to launch with this Model instance + + Each script function to the model will be loaded into a + non-converged orchestrator prior to the execution + of this Model instance. + + For converged orchestrators, the :meth:`add_script` method should be used. + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. 
+ + :param name: key to store function under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + :param devices_per_node: number of devices on each host + :type devices_per_node: int + """ + db_script = DBScript( + name=name, + script=function, + device=device, + devices_per_node=devices_per_node ) self._db_scripts.append(db_script) @@ -316,7 +355,11 @@ def __eq__(self, other): def __str__(self): # pragma: no cover entity_str = "Name: " + self.name + "\n" entity_str += "Type: " + self.type + "\n" - entity_str += str(self.run_settings) + entity_str += str(self.run_settings) + "\n" + if self._db_models: + entity_str += "DB Models: \n" + str(len(self._db_models)) + "\n" + if self._db_scripts: + entity_str += "DB Scripts: \n" + str(len(self._db_scripts)) + "\n" return entity_str diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index a43cc8b8d..496c28d9f 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -64,6 +64,8 @@ def serialize_model(model): :param model: TensorFlow or Keras model :type model: tf.Module + :return: serialized model, model input layer names, model output layer names + :rtype: str, list[str], list[str] """ full_model = tf.function(lambda x: model(x)) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py new file mode 100644 index 000000000..de6ca8524 --- /dev/null +++ b/tests/backends/test_dbmodel.py @@ -0,0 +1,172 @@ +import sys +import pytest + +from smartsim import Experiment, status +import smartsim +from smartsim._core.utils import installed_redisai_backends +from smartsim.error.errors import SSUnsupportedError + +should_run = True + +try: + import tensorflow.keras as keras + from tensorflow.keras.layers import Conv2D, Input +except ImportError: + should_run = False + +should_run &= "tensorflow" in installed_redisai_backends() + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + + +def save_tf_cnn(path, file_name): + """Create a Keras CNN for testing purposes + + """ + from smartsim.ml.tf import freeze_model + n = Net() + input_shape = (3,3,1) + n.build(input_shape=(None,*input_shape)) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return freeze_model(model, path, file_name) + + +def create_tf_cnn(): + """Create a Keras CNN for testing purposes + + """ + from smartsim.ml.tf import serialize_model + n = Net() + input_shape = (3,3,1) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return serialize_model(model) + + +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_colocated_db_model(fileutils): + """Test DB Models on colocated DB""" + + exp_name = "test-colocated-db-model" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + 
db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + colo_model.add_ml_model("cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs) + colo_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + # Assert we have added both models + assert(len(colo_model._db_models) == 2) + + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_db_model(fileutils): + """Test DB Models on remote DB""" + + exp_name = "test-db-model" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + + model, inputs, outputs = create_tf_cnn() + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + smartsim_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + for db_model in smartsim_model._db_models: + print(db_model) + + # Assert we have added both models + assert(len(smartsim_model._db_models) == 2) + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run or not "tensorflow" in installed_redisai_backends(), reason="Test needs TF to run") +def test_colocated_db_model_error(fileutils): + """Test error when colocated db model has no file.""" + + exp_name = "test-colocated-db-model-error" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + model, inputs, outputs = create_tf_cnn() + + colo_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + + with pytest.raises(SSUnsupportedError): + exp.start(colo_model, block=True) + + diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py new file mode 100644 index 000000000..b9cc827d1 --- /dev/null +++ b/tests/backends/test_dbscript.py @@ -0,0 +1,136 @@ +import sys +import os.path as osp +import pytest + +from smartsim import Experiment, status +from smartsim._core.utils import installed_redisai_backends +from smartsim.error.errors import SSUnsupportedError + +should_run = True + +try: + import torch +except ImportError: + should_run = False + +should_run &= "torch" in installed_redisai_backends() + +def 
timestwo(x): + return 2*x + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_colocated_db_script(fileutils): + """Test DB Scripts on colocated DB""" + + exp_name = "test-colocated-db-script" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + colo_model.add_script("test_script2", script=torch_script_str, device="CPU") + + # Assert we have added both models + assert(len(colo_model._db_scripts) == 2) + + for db_script in colo_model._db_scripts: + print(db_script) + + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script(fileutils): + """Test DB scripts on remote DB""" + + exp_name = "test-db-script" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + smartsim_model.add_script("test_script1", script_path=torch_script, device="CPU") + smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") + smartsim_model.add_function("test_func", function=timestwo, device="CPU") + + # Assert we have added both models + assert(len(smartsim_model._db_scripts) == 3) + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script_error(fileutils): + """Test DB Scripts error when setting a function on colocated DB""" + + exp_name = "test-colocated-db-script" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + colo_model.add_function("test_func", function=timestwo, device="CPU") + + # Assert we have added both models + assert(len(colo_model._db_scripts) == 1) + + with 
pytest.raises(SSUnsupportedError): + exp.start(colo_model, block=True) + \ No newline at end of file diff --git a/tests/test_configs/run_dbmodel_smartredis.py b/tests/test_configs/run_dbmodel_smartredis.py new file mode 100644 index 000000000..e94dd73dd --- /dev/null +++ b/tests/test_configs/run_dbmodel_smartredis.py @@ -0,0 +1,26 @@ +import numpy as np +from smartredis import Client + +def main(): + # address should be set as we are launching through + # SmartSim. + client = Client(cluster=False) + + array = np.ones((1, 3, 3, 1)).astype(np.single) + client.put_tensor("test_array", array) + assert client.poll_model("cnn", 500, 30) + client.run_model("cnn", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned.shape == (1, 1, 1, 1) + + array = np.ones((1, 3, 3, 1)).astype(np.single) + assert client.poll_model("cnn2", 500, 30) + client.run_model("cnn2", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned.shape == (1, 1, 1, 1) + print(f"Test worked!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py new file mode 100644 index 000000000..e88a9540c --- /dev/null +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -0,0 +1,31 @@ +import numpy as np +from smartredis import Client +from pytest import approx + +def main(): + # address should be set as we are launching through + # SmartSim. + client = Client(cluster=False) + + array = np.ones((1, 3, 3, 1)).astype(np.single) + client.put_tensor("test_array", array) + assert client.poll_model("test_script1", 500, 30) + client.run_script("test_script1", "average", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + assert returned == approx(np.mean(array)) + + assert client.poll_model("test_script2", 500, 30) + client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned == approx(-array) + + if client.model_exists("test_func"): + client.run_script("test_func", "timestwo", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + assert returned == approx(2*array) + + print(f"Test worked!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_configs/torchscript.py b/tests/test_configs/torchscript.py new file mode 100644 index 000000000..ca7ccee71 --- /dev/null +++ b/tests/test_configs/torchscript.py @@ -0,0 +1,4 @@ +# import torch + +def average(x): + return torch.tensor(torch.mean(x)).unsqueeze(0) From 9db1c96f9dc354a947a5cfb38fecaf05654375c1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 1 Apr 2022 16:14:40 -0500 Subject: [PATCH 03/15] First commit --- smartsim/_core/control/controller.py | 8 +- smartsim/_core/launcher/colocated.py | 8 +- smartsim/entity/__init__.py | 1 + smartsim/entity/dbobject.py | 143 +++++++++++++++++++++++++++ smartsim/entity/model.py | 99 ++++++++++++++++++- 5 files changed, 254 insertions(+), 5 deletions(-) create mode 100644 smartsim/entity/dbobject.py diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index d3bf2ecfb..1702fc432 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -30,8 +30,9 @@ import threading import time +from ..._core._cli.utils import get_install_path from ...database import Orchestrator -from ...entity import DBNode, EntityList, 
SmartSimEntity +from ...entity import DBNode, DBModel, DBObject, DBScript, EntityList, SmartSimEntity from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError from ...log import get_logger from ...status import STATUS_RUNNING, TERMINAL_STATUSES @@ -40,6 +41,9 @@ from ..utils import check_cluster_status, create_cluster from .jobmanager import JobManager +from smartredis import Client + + logger = get_logger(__name__) # job manager lock @@ -297,7 +301,7 @@ def _launch(self, manifest): batch_step = self._create_batch_job_step(elist) steps.append((batch_step, elist)) else: - # if ensemble is to be run as seperate job steps, aka not in a batch + # if ensemble is to be run as separate job steps, aka not in a batch job_steps = [(self._create_job_step(e), e) for e in elist.entities] steps.extend(job_steps) diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 223c0943f..3243dce4b 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -65,6 +65,10 @@ def write_colocated_launch_script(file_name, db_log, colocated_settings): f.write(f"{colocated_cmd}\n") f.write(f"DBPID=$!\n\n") + + if colocated_settings["db_models"]: + pass + if colocated_settings["limit_app_cpus"]: cpus = colocated_settings["cpus"] f.write( @@ -129,7 +133,7 @@ def _build_colocated_wrapper_cmd(port=6780, # add extra redisAI configurations for arg, value in rai_args.items(): if value: - # RAI wants arguments for inference in all capps + # RAI wants arguments for inference in all caps # ex. THREADS_PER_QUEUE=1 db_cmd.append(f"{arg.upper()} {str(value)}") @@ -142,7 +146,7 @@ def _build_colocated_wrapper_cmd(port=6780, ]) for db_arg, value in extra_db_args.items(): # replace "_" with "-" in the db_arg because we use kwargs - # for the extra configurations and Python doesn't allow a hypon + # for the extra configurations and Python doesn't allow a hyphen # in a variable name. All redis and KeyDB configuration options # use hyphens in their names. 
db_arg = db_arg.replace("_", "-") diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 0c8c54c48..ab661a7f1 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -3,3 +3,4 @@ from .entity import SmartSimEntity from .entityList import EntityList from .model import Model +from .dbobject import DBScript, DBModel, DBObject \ No newline at end of file diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py new file mode 100644 index 000000000..a493bad9a --- /dev/null +++ b/smartsim/entity/dbobject.py @@ -0,0 +1,143 @@ +from pathlib import Path +from .._core.utils.helpers import init_default + + +class DBObject: + def __init__(self, name, func, file_path, device, devices_per_node): + self.name = name + self.func = func + if file_path: + self.file = self._check_filepath(file_path) + self.device = self._check_device(device) + self.devices_per_node = devices_per_node + + def is_file(self): + if self.func: + return False + return True + + @staticmethod + def _check_tensor_args(inputs, outputs): + inputs = init_default([], inputs, (list, str)) + outputs = init_default([], outputs, (list, str)) + if isinstance(inputs, str): + inputs = [inputs] + if isinstance(outputs, str): + outputs = [outputs] + return inputs, outputs + + @staticmethod + def _check_backend(backend): + backend = backend.upper() + all_backends = ["TF", "TORCH", "ONNX"] + if backend in all_backends: + return backend + else: + raise ValueError( + f"Backend type {backend} unsupported. Options are {all_backends}") + + @staticmethod + def _check_filepath(file): + file_path = Path(file).resolve() + if not file_path.is_file(): + raise FileNotFoundError(file_path) + return file_path + + @staticmethod + def _check_device(device): + device = device.upper() + if not device.startswith("CPU") and not device.startswith("GPU"): + raise ValueError("Device argument must start with either CPU or GPU") + return device + +class DBScript(DBObject): + + def __init__(self, + name, + script=None, + script_path=None, + device="CPU", + devices_per_node=1 + ): + """TorchScript code represenation + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. 
+ + One of either script (in memory representation) or script_path (file) + must be provided + + :param name: key to store script under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code, defaults to None + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + """ + super().__init__(name, script, script_path, device, devices_per_node) + if not script and not script_path: + raise ValueError("Either script or script_path must be provided") + + @property + def script(self): + return self.func + +class DBModel(DBObject): + def __init__(self, + name, + backend, + model=None, + model_file=None, + device="CPU", + devices_per_node=1, + batch_size=0, + min_batch_size=0, + min_batch_timeout=0, + tag="", + inputs=None, + outputs=None): + """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + + One of either model (in memory representation) or model_path (file) + must be provided + + :param name: key to store model under + :type name: str + :param model: model in memory + :type model: str, optional + :param model_file: serialized model + :type model_file: file path to model, optional + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str + :param device: name of device for execution, defaults to "CPU" + :type device: str, optional + :param batch_size: batch size for execution, defaults to 0 + :type batch_size: int, optional + :param min_batch_size: minimum batch size for model execution, defaults to 0 + :type min_batch_size: int, optional + :param tag: additional tag for model information, defaults to "" + :type tag: str, optional + :param inputs: model inputs (TF only), defaults to None + :type inputs: list[str], optional + :param outputs: model outupts (TF only), defaults to None + :type outputs: list[str], optional + """ + super().__init__(name, model, model_file, device, devices_per_node) + self.backend = self._check_backend(backend) + if not model and not model_file: + raise ValueError("Either model or model_file must be provided") + self.batch_size = batch_size + self.min_batch_size = min_batch_size + self.min_batch_timeout = min_batch_timeout + self.tag = tag + self.inputs, self.outputs = self._check_tensor_args(inputs, outputs) + + @property + def model(self): + return self.func + diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 5b467559a..7d7cc62fa 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -29,6 +29,7 @@ from ..error import EntityExistsError, SSUnsupportedError from .entity import SmartSimEntity from .files import EntityFiles +from .dbobject import DBScript, DBModel class Model(SmartSimEntity): @@ -54,6 +55,8 @@ def __init__(self, name, params, path, run_settings, params_as_args=None): self.params_as_args = params_as_args self.incoming_entities = [] self._key_prefixing_enabled = False + self._db_models = [] + self._db_scripts = [] self.files = None @property @@ -197,7 +200,6 @@ def colocate_db(self, ]) self.run_settings.colocated_db_settings = colo_db_config - def params_to_args(self): """Convert parameters to command line arguments and update run settings.""" for param in self.params_as_args: @@ -213,6 +215,99 @@ def params_to_args(self): ) self.run_settings.add_exe_args(cat_arg_and_value(param, self.params[param])) + def add_ml_model(self, + name, + backend, + model=None, + model_path=None, + device="CPU", + devices_per_node=1, + batch_size=0, + 
min_batch_size=0, + tag="", + inputs=None, + outputs=None): + """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + + Each ML Model added will be loaded into an + orchestrator (converged or not) prior to the execution + of this Model instance + + One of either model (in memory representation) or model_path (file) + must be provided + + :param name: key to store model under + :type name: str + :param model: model in memory + :type model: str, optional # TODO figure out what to type hint this as + :param model_path: serialized model + :type model_path: file path to model + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str + :param device: name of device for execution, defaults to "CPU" + :type device: str, optional + :param batch_size: batch size for execution, defaults to 0 + :type batch_size: int, optional + :param min_batch_size: minimum batch size for model execution, defaults to 0 + :type min_batch_size: int, optional + :param tag: additional tag for model information, defaults to "" + :type tag: str, optional + :param inputs: model inputs (TF only), defaults to None + :type inputs: list[str], optional + :param outputs: model outupts (TF only), defaults to None + :type outputs: list[str], optional + """ + db_model = DBModel( + name, + backend, + model, + model_path, + device, + devices_per_node, + batch_size, + min_batch_size, + tag, + inputs, + outputs + ) + self._db_models.append(db_model) + + def add_script(self, name, script=None, script_path=None, device="CPU", devices_per_node=1): + """TorchScript to launch with this Model instance + + Each script added to the model will be loaded into an + orchestrator (converged or not) prior to the execution + of this Model instance + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. 
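        As a brief illustration, assuming ``model`` is a SmartSim ``Model``
        instance (the key name is a placeholder and the script body mirrors
        the one used in the tests), a TorchScript string can be attached
        directly::

            torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n"
            model.add_script("negate_script", script=torch_script_str, device="CPU")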
+ + One of either script (in memory representation) or script_path (file) + must be provided + + :param name: key to store script under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + :param devices_per_node: number of devices on each host + :type devices_per_node: int + """ + db_script = DBScript( + name, + script, + script_path, + device, + devices_per_node + ) + self._db_scripts.append(db_script) + def __eq__(self, other): if self.name == other.name: return True @@ -223,3 +318,5 @@ def __str__(self): # pragma: no cover entity_str += "Type: " + self.type + "\n" entity_str += str(self.run_settings) return entity_str + + From 75a1f749f13d9d606faf70a3382d1a2f43645a9b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 5 Apr 2022 11:53:26 -0500 Subject: [PATCH 04/15] Working add_script, add_function, add_model --- smartsim/_core/control/controller.py | 1 + smartsim/_core/entrypoints/colocated.py | 124 ++++++++++++- smartsim/_core/launcher/colocated.py | 74 +++++++- smartsim/entity/dbobject.py | 39 ++++ smartsim/entity/model.py | 79 ++++++-- smartsim/ml/tf/utils.py | 2 + tests/backends/test_dbmodel.py | 172 ++++++++++++++++++ tests/backends/test_dbscript.py | 136 ++++++++++++++ tests/test_configs/run_dbmodel_smartredis.py | 26 +++ tests/test_configs/run_dbscript_smartredis.py | 31 ++++ tests/test_configs/torchscript.py | 4 + 11 files changed, 665 insertions(+), 23 deletions(-) create mode 100644 tests/backends/test_dbmodel.py create mode 100644 tests/backends/test_dbscript.py create mode 100644 tests/test_configs/run_dbmodel_smartredis.py create mode 100644 tests/test_configs/run_dbscript_smartredis.py create mode 100644 tests/test_configs/torchscript.py diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 1702fc432..205c825fd 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -42,6 +42,7 @@ from .jobmanager import JobManager from smartredis import Client +from smartredis.error import RedisConnectionError, RedisReplyError logger = get_logger(__name__) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index eaebeb3d7..484050ce3 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -37,6 +37,8 @@ from pathlib import Path from subprocess import PIPE, STDOUT +from smartredis import Client +from smartredis.error import RedisConnectionError from smartsim._core.utils.network import current_ip from smartsim.error import SSInternalError from smartsim.log import get_logger @@ -55,8 +57,107 @@ def handle_signal(signo, frame): cleanup() +def launch_db_model(client: Client, db_model: List[str]): + """Parse options to launch model on local cluster -def main(network_interface: str, db_cpus: int, command: List[str]): + :param client: SmartRedis client connected to local DB + :type client: Client + :param db_model: List of arguments defining the model + :type db_model: List[str] + :return: Name of model + :rtype: str + """ + parser = argparse.ArgumentParser("Set ML model on DB") + parser.add_argument("--name", type=str) + parser.add_argument("--file", type=str) + parser.add_argument("--backend", type=str) + parser.add_argument("--device", type=str) + parser.add_argument("--devices_per_node", type=int) + 
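    # For reference, _build_db_model_cmd (added to the launcher later in this
    # patch) emits options in the "--opt=value" form, so a parsed db_model list
    # might look like this (the key, path, and tensor names are hypothetical):
    #   ["--name=cnn", "--file=/tmp/model1.pb", "--backend=TF", "--device=CPU",
    #    "--devices_per_node=1", "--inputs=args_0", "--outputs=Identity"]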
parser.add_argument("--batch_size", type=int, default=0) + parser.add_argument("--min_batch_size", type=int, default=0) + parser.add_argument("--tag", type=str, default="") + parser.add_argument("--inputs", nargs="+", default=None) + parser.add_argument("--outputs", nargs="+", default=None) + + # Unused if we use SmartRedis + parser.add_argument("--min_batch_timeout", type=int, default=None) + args = parser.parse_args(db_model) + + if args.inputs: + inputs = list(args.inputs) + if args.outputs: + outputs = list(args.outputs) + + if args.devices_per_node == 1: + client.set_model_from_file(args.name, + args.file, + args.backend, + args.device, + args.batch_size, + args.min_batch_size, + args.tag, + inputs, + outputs) + else: + for device_num in range(args.devices_per_node): + client.set_model_from_file(args.name, + args.file, + args.backend, + args.device+f":{device_num}", + args.batch_size, + args.min_batch_size, + args.tag, + inputs, + outputs) + + return args.name + +def launch_db_script(client: Client, db_script: List[str]): + """Parse options to launch script on local cluster + + :param client: SmartRedis client connected to local DB + :type client: Client + :param db_model: List of arguments defining the script + :type db_model: List[str] + :return: Name of model + :rtype: str + """ + parser = argparse.ArgumentParser("Set script on DB") + parser.add_argument("--name", type=str) + parser.add_argument("--func", type=str) + parser.add_argument("--file", type=str) + parser.add_argument("--backend", type=str) + parser.add_argument("--device", type=str) + parser.add_argument("--devices_per_node", type=int) + args = parser.parse_args(db_script) + if args.func: + func = args.func.replace("\\n", "\n") + + if args.devices_per_node == 1: + client.set_script(args.name, + func, + args.device) + else: + for device_num in range(args.devices_per_node): + client.set_script(args.name, + func, + args.device+f":{device_num}") + elif args.file: + if args.devices_per_node == 1: + client.set_script_from_file(args.name, + args.file, + args.device) + else: + for device_num in range(args.devices_per_node): + client.set_script_from_file(args.name, + args.file, + args.device+f":{device_num}") + + + return args.name + + +def main(network_interface: str, db_cpus: int, command: List[str], db_models: List[List[str]], db_scripts: List[List[str]]): global DBPID try: @@ -102,6 +203,23 @@ def main(network_interface: str, db_cpus: int, command: List[str]): f"\tCommand: {' '.join(cmd)}\n\n" ))) + if db_models or db_scripts: + try: + client = Client(cluster=False) + for i, db_model in enumerate(db_models): + logger.debug("Uploading model") + model_name = launch_db_model(client, db_model) + logger.debug(f"Added model {model_name} ({i+1}/{len(db_models)})") + for i, db_script in enumerate(db_scripts): + logger.debug("Uploading script") + script_name = launch_db_script(client, db_script) + logger.debug(f"Added script {script_name} ({i+1}/{len(db_scripts)})") + # Make sure we don't keep this around + del client + except RedisConnectionError: + raise SSInternalError("Failed to set model or script, could not connect to database") + + for line in iter(p.stdout.readline, b""): print(line.decode("utf-8").rstrip(), flush=True) @@ -144,6 +262,8 @@ def cleanup(): parser.add_argument("+lockfile", type=str, help="Filename to create for single proc per host") parser.add_argument("+db_cpus", type=int, default=2, help="Number of CPUs to use for DB") parser.add_argument("+command", nargs="+", help="Command to run") + 
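    # Illustration only: each "+db_model" or "+db_script" group that the launcher
    # appends to the entrypoint command line (the file path below is hypothetical)
    # is collected into its own list via action="append" with nargs="+":
    #   +db_model --name=cnn --file=/tmp/model1.pb --backend=TF --device=CPU --devices_per_node=1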
parser.add_argument("+db_model", nargs="+", action="append", default=[], help="Model to set on DB") + parser.add_argument("+db_script", nargs="+", action="append", default=[], help="Script to set on DB") args = parser.parse_args() tmp_lockfile = Path(tempfile.gettempdir()) / args.lockfile @@ -160,7 +280,7 @@ def cleanup(): for sig in SIGNALS: signal.signal(sig, handle_signal) - main(args.ifname, args.db_cpus, args.command) + main(args.ifname, args.db_cpus, args.command, args.db_model, args.db_script) # gracefully exit the processes in the distributed application that # we do not want to have start a colocated process. Only one process diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 3243dce4b..8602d9445 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -25,7 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + from ..config import CONFIG +from ...error import SSUnsupportedError from ..utils.helpers import create_lockfile_name @@ -66,9 +68,6 @@ def write_colocated_launch_script(file_name, db_log, colocated_settings): f.write(f"{colocated_cmd}\n") f.write(f"DBPID=$!\n\n") - if colocated_settings["db_models"]: - pass - if colocated_settings["limit_app_cpus"]: cpus = colocated_settings["cpus"] f.write( @@ -154,9 +153,78 @@ def _build_colocated_wrapper_cmd(port=6780, f"--{db_arg}", value ]) + + db_models = kwargs.get("db_models", None) + if db_models: + db_model_cmd = _build_db_model_cmd(db_models) + db_cmd.extend(db_model_cmd) + + db_scripts = kwargs.get("db_scripts", None) + if db_scripts: + db_script_cmd = _build_db_script_cmd(db_scripts) + db_cmd.extend(db_script_cmd) + # run colocated db in the background db_cmd.append("&") cmd.extend(db_cmd) return " ".join(cmd) + +def _build_db_model_cmd(db_models): + cmd = [] + for db_model in db_models: + cmd.append("+db_model") + cmd.append(f"--name={db_model.name}") + if db_model.file: + cmd.append(f"--file={db_model.file}") + else: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += "Please store the ML model in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + cmd.append(f"--backend={db_model.backend}") + cmd.append(f"--device={db_model.device}") + cmd.append(f"--devices_per_node={db_model.devices_per_node}") + if db_model.batch_size: + cmd.append(f"--batch_size={db_model.batch_size}") + if db_model.min_batch_size: + cmd.append(f"--min_batch_size={db_model.min_batch_size}") + if db_model.min_batch_timeout: + cmd.append(f"--min_batch_timeout={db_model.min_batch_timeout}") + if db_model.tag: + cmd.append(f"--tag={db_model.tag}") + if db_model.inputs: + cmd.append("--inputs="+",".join(db_model.inputs)) + if db_model.outputs: + cmd.append("--outputs="+",".join(db_model.outputs)) + + return cmd + + + + +def _build_db_script_cmd(db_scripts): + cmd = [] + for db_script in db_scripts: + cmd.append("+db_script") + cmd.append(f"--name={db_script.name}") + if db_script.func: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += "Please convert the function to a string or store it as a text file " + err_msg += "and add it to the SmartSim Model with add_script." 
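                # A live Python callable cannot be serialized onto the command line
                # that configures the colocated database, so only scripts that are
                # already strings (or files) can be forwarded to the entrypoint.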
+ raise SSUnsupportedError(err_msg) + + sanitized_func = db_script.func.replace("\n", "\\n") + if not (sanitized_func.startswith("'") and sanitized_func.endswith("'") + or (sanitized_func.startswith('"') and sanitized_func.endswith('"'))): + sanitized_func = "\"" + sanitized_func + "\"" + cmd.append(f"--func={sanitized_func}") + elif db_script.file: + cmd.append(f"--file={db_script.file}") + cmd.append(f"--device={db_script.device}") + cmd.append(f"--devices_per_node={db_script.devices_per_node}") + + return cmd + \ No newline at end of file diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index a493bad9a..3c40aa6ea 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -8,9 +8,13 @@ def __init__(self, name, func, file_path, device, devices_per_node): self.func = func if file_path: self.file = self._check_filepath(file_path) + else: + # Need to have this explicitly to check on it + self.file = None self.device = self._check_device(device) self.devices_per_node = devices_per_node + @property def is_file(self): if self.func: return False @@ -87,6 +91,17 @@ def __init__(self, def script(self): return self.func + def __str__(self): + desc_str = "Name: " + self.name + "\n" + if self.func: + desc_str += "Func: " + self.func + "\n" + if self.file: + desc_str += "File path: " + str(self.file) + "\n" + devices_str = self.device + ("s per node\n" if self.devices_per_node > 1 else " per node\n") + desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str + return desc_str + + class DBModel(DBObject): def __init__(self, name, @@ -120,6 +135,8 @@ def __init__(self, :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 :type min_batch_size: int, optional + :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 + :type min_batch_timeout: int, optional :param tag: additional tag for model information, defaults to "" :type tag: str, optional :param inputs: model inputs (TF only), defaults to None @@ -141,3 +158,25 @@ def __init__(self, def model(self): return self.func + def __str__(self): + desc_str = "Name: " + self.name + "\n" + if self.model: + desc_str += "Model stored in memory" + if self.file: + desc_str += "File path: " + str(self.file) + "\n" + devices_str = self.device + ("s per node\n" if self.devices_per_node > 1 else " per node\n") + desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str + desc_str += "Backend: " + str(self.backend) + "\n" + if self.batch_size: + desc_str += "Batch size: " + str(self.batch_size) + "\n" + if self.min_batch_size: + desc_str += "Min batch size: " + str(self.min_batch_size) + "\n" + if self.min_batch_timeout: + desc_str += "Min batch time out: " + str(self.min_batch_timeout) + "\n" + if self.tag: + desc_str += "Tag: " + self.tag + "\n" + if self.inputs: + desc_str += "Inputs: " + str(self.inputs) + "\n" + if self.outputs: + desc_str += "Outputs: " + str(self.outputs) + "\n" + return desc_str diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 7d7cc62fa..8c52d3528 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -198,6 +198,10 @@ def colocate_db(self, colo_db_config["extra_db_args"] = dict([ (k,str(v)) for k,v in kwargs.items() if k not in colo_db_config["rai_args"] ]) + + colo_db_config["db_models"] = self._db_models + colo_db_config["db_scripts"] = self._db_scripts + self.run_settings.colocated_db_settings = colo_db_config def params_to_args(self): @@ -258,17 +262,17 @@ 
def add_ml_model(self, :type outputs: list[str], optional """ db_model = DBModel( - name, - backend, - model, - model_path, - device, - devices_per_node, - batch_size, - min_batch_size, - tag, - inputs, - outputs + name=name, + backend=backend, + model=model, + model_file=model_path, + device=device, + devices_per_node=devices_per_node, + batch_size=batch_size, + min_batch_size=min_batch_size, + tag=tag, + inputs=inputs, + outputs=outputs ) self._db_models.append(db_model) @@ -285,7 +289,7 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ Setting ``devices_per_node=N``, with N greater than one will result in the model being stored in the first N devices of type ``device``. - One of either script (in memory representation) or script_path (file) + One of either script (in memory string representation) or script_path (file) must be provided :param name: key to store script under @@ -300,11 +304,46 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ :type devices_per_node: int """ db_script = DBScript( - name, - script, - script_path, - device, - devices_per_node + name=name, + script=script, + script_path=script_path, + device=device, + devices_per_node=devices_per_node + ) + self._db_scripts.append(db_script) + + + def add_function(self, name, function=None, device="CPU", devices_per_node=1): + """TorchScript function to launch with this Model instance + + Each script function to the model will be loaded into a + non-converged orchestrator prior to the execution + of this Model instance. + + For converged orchestrators, the :meth:`add_script` method should be used. + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. 
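        A minimal sketch, following the pattern used in the tests and assuming
        ``model`` is a SmartSim ``Model`` instance (the key name is a
        placeholder)::

            def timestwo(x):
                return 2 * x

            model.add_function("timestwo", function=timestwo, device="CPU")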
+ + :param name: key to store function under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + :param devices_per_node: number of devices on each host + :type devices_per_node: int + """ + db_script = DBScript( + name=name, + script=function, + device=device, + devices_per_node=devices_per_node ) self._db_scripts.append(db_script) @@ -316,7 +355,11 @@ def __eq__(self, other): def __str__(self): # pragma: no cover entity_str = "Name: " + self.name + "\n" entity_str += "Type: " + self.type + "\n" - entity_str += str(self.run_settings) + entity_str += str(self.run_settings) + "\n" + if self._db_models: + entity_str += "DB Models: \n" + str(len(self._db_models)) + "\n" + if self._db_scripts: + entity_str += "DB Scripts: \n" + str(len(self._db_scripts)) + "\n" return entity_str diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index a43cc8b8d..496c28d9f 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -64,6 +64,8 @@ def serialize_model(model): :param model: TensorFlow or Keras model :type model: tf.Module + :return: serialized model, model input layer names, model output layer names + :rtype: str, list[str], list[str] """ full_model = tf.function(lambda x: model(x)) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py new file mode 100644 index 000000000..de6ca8524 --- /dev/null +++ b/tests/backends/test_dbmodel.py @@ -0,0 +1,172 @@ +import sys +import pytest + +from smartsim import Experiment, status +import smartsim +from smartsim._core.utils import installed_redisai_backends +from smartsim.error.errors import SSUnsupportedError + +should_run = True + +try: + import tensorflow.keras as keras + from tensorflow.keras.layers import Conv2D, Input +except ImportError: + should_run = False + +should_run &= "tensorflow" in installed_redisai_backends() + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + + +def save_tf_cnn(path, file_name): + """Create a Keras CNN for testing purposes + + """ + from smartsim.ml.tf import freeze_model + n = Net() + input_shape = (3,3,1) + n.build(input_shape=(None,*input_shape)) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return freeze_model(model, path, file_name) + + +def create_tf_cnn(): + """Create a Keras CNN for testing purposes + + """ + from smartsim.ml.tf import serialize_model + n = Net() + input_shape = (3,3,1) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return serialize_model(model) + + +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_colocated_db_model(fileutils): + """Test DB Models on colocated DB""" + + exp_name = "test-colocated-db-model" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + 
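        # Settings used throughout these tests: one CPU and a fixed port for the
        # colocated database, no CPU limiting for the application
        # (limit_app_cpus=False), and the loopback interface ("lo") so the test
        # runs entirely on the local node.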
db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + colo_model.add_ml_model("cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs) + colo_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + # Assert we have added both models + assert(len(colo_model._db_models) == 2) + + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_db_model(fileutils): + """Test DB Models on remote DB""" + + exp_name = "test-db-model" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + + model, inputs, outputs = create_tf_cnn() + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + smartsim_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + for db_model in smartsim_model._db_models: + print(db_model) + + # Assert we have added both models + assert(len(smartsim_model._db_models) == 2) + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run or not "tensorflow" in installed_redisai_backends(), reason="Test needs TF to run") +def test_colocated_db_model_error(fileutils): + """Test error when colocated db model has no file.""" + + exp_name = "test-colocated-db-model-error" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + model, inputs, outputs = create_tf_cnn() + + colo_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + + with pytest.raises(SSUnsupportedError): + exp.start(colo_model, block=True) + + diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py new file mode 100644 index 000000000..b9cc827d1 --- /dev/null +++ b/tests/backends/test_dbscript.py @@ -0,0 +1,136 @@ +import sys +import os.path as osp +import pytest + +from smartsim import Experiment, status +from smartsim._core.utils import installed_redisai_backends +from smartsim.error.errors import SSUnsupportedError + +should_run = True + +try: + import torch +except ImportError: + should_run = False + +should_run &= "torch" in installed_redisai_backends() + +def 
timestwo(x): + return 2*x + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_colocated_db_script(fileutils): + """Test DB Scripts on colocated DB""" + + exp_name = "test-colocated-db-script" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + colo_model.add_script("test_script2", script=torch_script_str, device="CPU") + + # Assert we have added both models + assert(len(colo_model._db_scripts) == 2) + + for db_script in colo_model._db_scripts: + print(db_script) + + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script(fileutils): + """Test DB scripts on remote DB""" + + exp_name = "test-db-script" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + smartsim_model.add_script("test_script1", script_path=torch_script, device="CPU") + smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") + smartsim_model.add_function("test_func", function=timestwo, device="CPU") + + # Assert we have added both models + assert(len(smartsim_model._db_scripts) == 3) + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script_error(fileutils): + """Test DB Scripts error when setting a function on colocated DB""" + + exp_name = "test-colocated-db-script" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + colo_model.add_function("test_func", function=timestwo, device="CPU") + + # Assert we have added both models + assert(len(colo_model._db_scripts) == 1) + + with 
pytest.raises(SSUnsupportedError): + exp.start(colo_model, block=True) + \ No newline at end of file diff --git a/tests/test_configs/run_dbmodel_smartredis.py b/tests/test_configs/run_dbmodel_smartredis.py new file mode 100644 index 000000000..e94dd73dd --- /dev/null +++ b/tests/test_configs/run_dbmodel_smartredis.py @@ -0,0 +1,26 @@ +import numpy as np +from smartredis import Client + +def main(): + # address should be set as we are launching through + # SmartSim. + client = Client(cluster=False) + + array = np.ones((1, 3, 3, 1)).astype(np.single) + client.put_tensor("test_array", array) + assert client.poll_model("cnn", 500, 30) + client.run_model("cnn", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned.shape == (1, 1, 1, 1) + + array = np.ones((1, 3, 3, 1)).astype(np.single) + assert client.poll_model("cnn2", 500, 30) + client.run_model("cnn2", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned.shape == (1, 1, 1, 1) + print(f"Test worked!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py new file mode 100644 index 000000000..e88a9540c --- /dev/null +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -0,0 +1,31 @@ +import numpy as np +from smartredis import Client +from pytest import approx + +def main(): + # address should be set as we are launching through + # SmartSim. + client = Client(cluster=False) + + array = np.ones((1, 3, 3, 1)).astype(np.single) + client.put_tensor("test_array", array) + assert client.poll_model("test_script1", 500, 30) + client.run_script("test_script1", "average", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + assert returned == approx(np.mean(array)) + + assert client.poll_model("test_script2", 500, 30) + client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned == approx(-array) + + if client.model_exists("test_func"): + client.run_script("test_func", "timestwo", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + assert returned == approx(2*array) + + print(f"Test worked!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_configs/torchscript.py b/tests/test_configs/torchscript.py new file mode 100644 index 000000000..ca7ccee71 --- /dev/null +++ b/tests/test_configs/torchscript.py @@ -0,0 +1,4 @@ +# import torch + +def average(x): + return torch.tensor(torch.mean(x)).unsqueeze(0) From 2e31e626f318704a050b2755f355e941709568f5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 18 Apr 2022 18:17:12 +0200 Subject: [PATCH 05/15] Address reviewers' comments --- smartsim/_core/control/controller.py | 8 +- smartsim/_core/launcher/colocated.py | 8 +- smartsim/_core/utils/redis.py | 67 +++++++++++ smartsim/entity/__init__.py | 2 +- smartsim/entity/dbobject.py | 22 ++++ smartsim/entity/ensemble.py | 168 +++++++++++++++++++++++++++ smartsim/entity/model.py | 12 ++ 7 files changed, 281 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 205c825fd..d59d72f72 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -30,9 +30,9 @@ import threading import time -from ..._core._cli.utils import get_install_path +from ..._core.utils.redis import 
set_ml_model, set_script from ...database import Orchestrator -from ...entity import DBNode, DBModel, DBObject, DBScript, EntityList, SmartSimEntity +from ...entity import DBNode, EntityList, SmartSimEntity from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError from ...log import get_logger from ...status import STATUS_RUNNING, TERMINAL_STATUSES @@ -42,7 +42,7 @@ from .jobmanager import JobManager from smartredis import Client -from smartredis.error import RedisConnectionError, RedisReplyError +from smartredis.error import RedisConnectionError logger = get_logger(__name__) @@ -298,6 +298,8 @@ def _launch(self, manifest): steps = [] all_entity_lists = manifest.ensembles + manifest.ray_clusters for elist in all_entity_lists: + elist.add_dbobjects_to_entities() + if elist.batch: batch_step = self._create_batch_job_step(elist) steps.append((batch_step, elist)) diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 8602d9445..c3a8a7ec0 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -179,6 +179,9 @@ def _build_db_model_cmd(db_models): if db_model.file: cmd.append(f"--file={db_model.file}") else: + # This is caught when the DBModel is added through add_ml_model, + # but we keep this check for the sake of safety in case + # DBModels are just copied over from another entity err_msg = "ML model can not be set from memory for colocated databases.\n" err_msg += "Please store the ML model in binary format " err_msg += "and add it to the SmartSim Model as file." @@ -202,14 +205,15 @@ def _build_db_model_cmd(db_models): return cmd - - def _build_db_script_cmd(db_scripts): cmd = [] for db_script in db_scripts: cmd.append("+db_script") cmd.append(f"--name={db_script.name}") if db_script.func: + # This is caught when the DBScript is added through add_script, + # but we keep this check for the sake of safety in case + # DBScripts are just copied over from another entity if not isinstance(db_script.func, str): err_msg = "Functions can not be set from memory for colocated databases.\n" err_msg += "Please convert the function to a string or store it as a text file " diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index eabb87eb0..dd7b8b5e1 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -30,9 +30,12 @@ import redis from rediscluster import RedisCluster from rediscluster.exceptions import ClusterDownError, RedisClusterException +from smartredis import Client +from smartredis.error import RedisReplyError logging.getLogger("rediscluster").setLevel(logging.WARNING) +from ...entity import DBModel, DBScript from ...error import SSInternalError from ...log import get_logger from ..config import CONFIG @@ -110,3 +113,67 @@ def check_cluster_status(hosts, ports, trials=10): # cov-wlm trials -= 1 if trials == 0: raise SSInternalError("Cluster setup could not be verified") + + +def set_ml_model(db_model: DBModel, client: Client): + devices = db_model._enumerate_devices() + + for device in devices: + try: + if db_model.is_file: + client.set_model_from_file( + name=db_model.name, + model_file=str(db_model.file), + backend=db_model.backend, + device=device, + batch_size=db_model.batch_size, + min_batch_size=db_model.min_batch_size, + tag=db_model.tag, + inputs=db_model.inputs, + outputs=db_model.outputs + ) + else: + client.set_model( + name=db_model.name, + model=db_model.model, + backend=db_model.backend, + device=device, + 
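                    # The remaining keyword arguments are taken verbatim from the
                    # DBModel instance and forwarded by SmartRedis when the model
                    # is registered on the orchestrator.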
batch_size=db_model.batch_size, + min_batch_size=db_model.min_batch_size, + tag=db_model.tag, + inputs=db_model.inputs, + outputs=db_model.outputs + ) + except RedisReplyError as error: + logger.error("Error while setting model on orchestrator.") + raise error + + +def set_script(db_script: DBScript, client: Client): + devices = db_script._enumerate_devices() + + for device in devices: + try: + if db_script.is_file: + client.set_script_from_file( + name=db_script.name, + file=str(db_script.file), + device=device + ) + else: + if isinstance(db_script.script, str): + client.set_script( + name=db_script.name, + script=db_script.script, + device=device + ) + else: + client.set_function( + name=db_script.name, + function=db_script.script, + device=device + ) + + except RedisReplyError as error: + logger.error("Error while setting model on orchestrator.") + raise error \ No newline at end of file diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index ab661a7f1..de7541387 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -3,4 +3,4 @@ from .entity import SmartSimEntity from .entityList import EntityList from .model import Model -from .dbobject import DBScript, DBModel, DBObject \ No newline at end of file +from .dbobject import * \ No newline at end of file diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 3c40aa6ea..275c5612b 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -1,6 +1,7 @@ from pathlib import Path from .._core.utils.helpers import init_default +__all__ = ["DBObject", "DBModel", "DBScript"] class DBObject: def __init__(self, name, func, file_path, device, devices_per_node): @@ -54,6 +55,27 @@ def _check_device(device): raise ValueError("Device argument must start with either CPU or GPU") return device + def _enumerate_devices(self): + """Enumerate devices for a DBObject + + :param dbobject: DBObject to enumerate + :type dbobject: DBObject + :return: list of device names + :rtype: list[str] + """ + devices = [] + if ":" in self.device and self.devices_per_node > 1: + msg = "Cannot set devices_per_node>1 if a device numeral is specified, " + msg += f"the device was set to {self.device} and devices_per_node=={self.devices_per_node}" + raise ValueError(msg) + if self.device in ["CPU", "GPU"] and self.devices_per_node > 1: + for device_num in range(self.devices_per_node): + devices.append(f"{self.device}:{str(device_num)}") + else: + devices = [self.device] + + return devices + class DBScript(DBObject): def __init__(self, diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index 972579f3e..b7203d35a 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -36,6 +36,7 @@ ) from ..log import get_logger from ..settings.base import BatchSettings, RunSettings +from .dbobject import DBModel, DBScript from .entityList import EntityList from .model import Model from .strategies import create_all_permutations, random_permutations, step_values @@ -90,6 +91,8 @@ def __init__( self._key_prefixing_enabled = True self.batch_settings = init_default({}, batch_settings, BatchSettings) self.run_settings = init_default({}, run_settings, RunSettings) + self._db_models = [] + self._db_scripts = [] super().__init__(name, getcwd(), perm_strat=perm_strat, **kwargs) @property @@ -298,3 +301,168 @@ def _read_model_parameters(self): + "Must be list, int, or string." 
) return param_names, parameters + + + def add_ml_model(self, + name, + backend, + model=None, + model_path=None, + device="CPU", + devices_per_node=1, + batch_size=0, + min_batch_size=0, + tag="", + inputs=None, + outputs=None): + """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + + Each ML Model added will be loaded into an + orchestrator (converged or not) prior to the execution + of every entity belonging to this ensemble + + One of either model (in memory representation) or model_path (file) + must be provided + + :param name: key to store model under + :type name: str + :param model: model in memory + :type model: str, optional # TODO figure out what to type hint this as + :param model_path: serialized model + :type model_path: file path to model + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str + :param device: name of device for execution, defaults to "CPU" + :type device: str, optional + :param batch_size: batch size for execution, defaults to 0 + :type batch_size: int, optional + :param min_batch_size: minimum batch size for model execution, defaults to 0 + :type min_batch_size: int, optional + :param tag: additional tag for model information, defaults to "" + :type tag: str, optional + :param inputs: model inputs (TF only), defaults to None + :type inputs: list[str], optional + :param outputs: model outupts (TF only), defaults to None + :type outputs: list[str], optional + """ + db_model = DBModel( + name=name, + backend=backend, + model=model, + model_file=model_path, + device=device, + devices_per_node=devices_per_node, + batch_size=batch_size, + min_batch_size=min_batch_size, + tag=tag, + inputs=inputs, + outputs=outputs + ) + self._db_models.append(db_model) + + def add_script(self, name, script=None, script_path=None, device="CPU", devices_per_node=1): + """TorchScript to launch with every entity belonging to this ensemble + + Each script added to the model will be loaded into an + orchestrator (converged or not) prior to the execution + of every entity belonging to this ensemble + + Device selection is either "GPU" or "CPU". If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. + + One of either script (in memory string representation) or script_path (file) + must be provided + + :param name: key to store script under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + :param devices_per_node: number of devices on each host + :type devices_per_node: int + """ + db_script = DBScript( + name=name, + script=script, + script_path=script_path, + device=device, + devices_per_node=devices_per_node + ) + self._db_scripts.append(db_script) + + + def add_function(self, name, function=None, device="CPU", devices_per_node=1): + """TorchScript function to launch with every entity belonging to this ensemble + + Each script function to the model will be loaded into a + non-converged orchestrator prior to the execution + of every entity belonging to this ensemble. + + For converged orchestrators, the :meth:`add_script` method should be used. + + Device selection is either "GPU" or "CPU". 
If many devices are + present, a number can be passed for specification e.g. "GPU:1". + + Setting ``devices_per_node=N``, with N greater than one will result + in the model being stored in the first N devices of type ``device``. + + :param name: key to store function under + :type name: str + :param script: TorchScript code + :type script: str, optional + :param script_path: path to TorchScript code + :type script_path: str, optional + :param device: device for script execution, defaults to "CPU" + :type device: str, optional + :param devices_per_node: number of devices on each host + :type devices_per_node: int + """ + db_script = DBScript( + name=name, + script=function, + device=device, + devices_per_node=devices_per_node + ) + self._db_scripts.append(db_script) + + def _add_dbobjects_to_entities(self): + """Add ensemble DBObjects to each colocated entity + """ + + if self._db_models: + for entity in self.entities: + # Colocated entities are responsible for their + # DBModels, as they launch them in the entry point + if entity.colocated: + entity_db_models = [db_model.name for db_model in entity._db_models] + for db_model in self._db_models: + if db_model.is_file: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += "Please store the ML model in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + + if not db_model.name in entity_db_models: + entity._db_models.append(db_model) + + if self._db_scripts: + for entity in self.entities: + # Colocated entities are responsible for their + # DBScripts, as they launch them in the entry point + if entity.colocated: + entity_db_scripts = [db_script.name for db_script in entity._db_scripts] + for db_script in self._db_scripts: + if db_script.func: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += "Please convert the function to a string or store it as a text file " + err_msg += "and add it to the SmartSim Model with add_script." + raise SSUnsupportedError(err_msg) + if not db_script.name in entity_db_scripts: + entity._db_scripts.append(db_script) \ No newline at end of file diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 8c52d3528..c97d9f975 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -274,6 +274,12 @@ def add_ml_model(self, inputs=inputs, outputs=outputs ) + if not db_model.is_file and self.colocated: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += "Please store the ML model in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + self._db_models.append(db_model) def add_script(self, name, script=None, script_path=None, device="CPU", devices_per_node=1): @@ -310,6 +316,12 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ device=device, devices_per_node=devices_per_node ) + if db_script.func and self.colocated: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += "Please convert the function to a string or store it as a text file " + err_msg += "and add it to the SmartSim Model with add_script." 
+ raise SSUnsupportedError(err_msg) self._db_scripts.append(db_script) From 8f6ba4bb17007cf05913809bae798c6f1f2ab23a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 1 Apr 2022 16:14:40 -0500 Subject: [PATCH 06/15] First commit --- smartsim/_core/control/controller.py | 15 ++++++++++----- smartsim/_core/launcher/colocated.py | 3 +++ smartsim/entity/__init__.py | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index d59d72f72..a8e5c32e7 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -32,7 +32,7 @@ from ..._core.utils.redis import set_ml_model, set_script from ...database import Orchestrator -from ...entity import DBNode, EntityList, SmartSimEntity +from ...entity import DBNode, DBModel, DBObject, DBScript, EntityList, SmartSimEntity from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError from ...log import get_logger from ...status import STATUS_RUNNING, TERMINAL_STATUSES @@ -75,10 +75,15 @@ def start(self, manifest, block=True, kill_on_interrupt=True): The controller will start the job-manager thread upon execution of all jobs. """ - self._jobs.kill_on_interrupt = kill_on_interrupt - # register custom signal handler for ^C (SIGINT) - signal.signal(signal.SIGINT, self._jobs.signal_interrupt) - self._launch(manifest) + try: + self._launch(manifest) + + # start the job manager thread if not already started + if not self._jobs.actively_monitoring: + self._jobs.start() + + if self.orchestrator_active(): + self.set_dbobjects(manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index c3a8a7ec0..b9013c9f3 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -68,6 +68,9 @@ def write_colocated_launch_script(file_name, db_log, colocated_settings): f.write(f"{colocated_cmd}\n") f.write(f"DBPID=$!\n\n") + if colocated_settings["db_models"]: + pass + if colocated_settings["limit_app_cpus"]: cpus = colocated_settings["cpus"] f.write( diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index de7541387..3595b6c96 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -3,4 +3,4 @@ from .entity import SmartSimEntity from .entityList import EntityList from .model import Model -from .dbobject import * \ No newline at end of file +from .dbobject import * From 004b85e55535be5d0e78e21205e2803f054c556a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 5 Apr 2022 11:53:26 -0500 Subject: [PATCH 07/15] Working add_script, add_function, add_model --- smartsim/_core/control/controller.py | 4 ++-- smartsim/_core/launcher/colocated.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index a8e5c32e7..870068b1e 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -82,8 +82,8 @@ def start(self, manifest, block=True, kill_on_interrupt=True): if not self._jobs.actively_monitoring: self._jobs.start() - if self.orchestrator_active(): - self.set_dbobjects(manifest) + if self.orchestrator_active: + self._set_dbobjects(manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: diff --git a/smartsim/_core/launcher/colocated.py 
b/smartsim/_core/launcher/colocated.py index b9013c9f3..c3a8a7ec0 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -68,9 +68,6 @@ def write_colocated_launch_script(file_name, db_log, colocated_settings): f.write(f"{colocated_cmd}\n") f.write(f"DBPID=$!\n\n") - if colocated_settings["db_models"]: - pass - if colocated_settings["limit_app_cpus"]: cpus = colocated_settings["cpus"] f.write( From 21bc88a0084760dceb6c08b9c8f85dd7431e6328 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 18 Apr 2022 18:17:12 +0200 Subject: [PATCH 08/15] Address reviewers' comments --- smartsim/_core/control/controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 870068b1e..0dba0fca7 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -32,7 +32,7 @@ from ..._core.utils.redis import set_ml_model, set_script from ...database import Orchestrator -from ...entity import DBNode, DBModel, DBObject, DBScript, EntityList, SmartSimEntity +from ...entity import DBNode, EntityList, SmartSimEntity from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError from ...log import get_logger from ...status import STATUS_RUNNING, TERMINAL_STATUSES From 0d8d7ae481c158ac6e494a5ecdf416cccbf271d8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 18 Apr 2022 19:37:06 +0200 Subject: [PATCH 09/15] Fix rebase --- smartsim/_core/control/controller.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 0dba0fca7..d59d72f72 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -75,15 +75,10 @@ def start(self, manifest, block=True, kill_on_interrupt=True): The controller will start the job-manager thread upon execution of all jobs. 
""" - try: - self._launch(manifest) - - # start the job manager thread if not already started - if not self._jobs.actively_monitoring: - self._jobs.start() - - if self.orchestrator_active: - self._set_dbobjects(manifest) + self._jobs.kill_on_interrupt = kill_on_interrupt + # register custom signal handler for ^C (SIGINT) + signal.signal(signal.SIGINT, self._jobs.signal_interrupt) + self._launch(manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: From b9f83d1972180c5b763197c75f815972e6111ada Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 18 Apr 2022 21:01:32 +0200 Subject: [PATCH 10/15] Fix function name --- smartsim/_core/control/controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index d59d72f72..b7a3e1f4a 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -298,7 +298,7 @@ def _launch(self, manifest): steps = [] all_entity_lists = manifest.ensembles + manifest.ray_clusters for elist in all_entity_lists: - elist.add_dbobjects_to_entities() + elist._add_dbobjects_to_entities() if elist.batch: batch_step = self._create_batch_job_step(elist) From 352f570b967d42e8ca872fe484833f634cd54cd2 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 18 Apr 2022 22:22:22 +0200 Subject: [PATCH 11/15] Update dbobject tests --- smartsim/_core/control/controller.py | 34 ++++++++++++++++++++++++++++ smartsim/entity/dbobject.py | 2 +- smartsim/entity/model.py | 7 ++++++ tests/backends/test_dbmodel.py | 5 ++-- tests/backends/test_dbscript.py | 9 +++----- 5 files changed, 47 insertions(+), 10 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index b7a3e1f4a..87150cac7 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -294,6 +294,9 @@ def _launch(self, manifest): for rc in manifest.ray_clusters: rc._update_workers() + if self.orchestrator_active: + self._set_dbobjects(manifest) + # create all steps prior to launch steps = [] all_entity_lists = manifest.ensembles + manifest.ray_clusters @@ -593,3 +596,34 @@ def reload_saved_db(self, checkpoint_file): finally: JM_LOCK.release() + + def _set_dbobjects(self, manifest): + db_addresses = self._jobs.get_db_host_addresses() + cluster = len(db_addresses) > 1 + address = db_addresses[0] + + try: + client = Client(address=address, cluster=cluster) + except RedisConnectionError as error: + logger.error("Could not connect to orchestrator") + raise error + + for model in manifest.models: + if not model.colocated: + for db_model in model._db_models: + set_ml_model(db_model, client) + for db_script in model._db_scripts: + set_script(db_script, client) + + for ensemble in manifest.ensembles: + for db_model in ensemble._db_models: + for entity in ensemble: + set_ml_model(db_model, client) + for db_script in ensemble._db_scripts: + for entity in ensemble: + set_script(db_script, client) + for entity in ensemble: + for db_model in entity._db_models: + set_ml_model(db_model, client) + for db_script in entity._db_scripts: + set_script(db_script, client) \ No newline at end of file diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 275c5612b..4f3dfa009 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -183,7 +183,7 @@ def model(self): def __str__(self): desc_str = "Name: " + self.name + "\n" if self.model: - desc_str += "Model stored 
in memory" + desc_str += "Model stored in memory\n" if self.file: desc_str += "File path: " + str(self.file) + "\n" devices_str = self.device + ("s per node\n" if self.devices_per_node > 1 else " per node\n") diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index c97d9f975..b35ca4e89 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -274,6 +274,7 @@ def add_ml_model(self, inputs=inputs, outputs=outputs ) + if not db_model.is_file and self.colocated: err_msg = "ML model can not be set from memory for colocated databases.\n" err_msg += "Please store the ML model in binary format " @@ -357,6 +358,12 @@ def add_function(self, name, function=None, device="CPU", devices_per_node=1): device=device, devices_per_node=devices_per_node ) + if db_script.func and self.colocated: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += "Please convert the function to a string or store it as a text file " + err_msg += "and add it to the SmartSim Model with add_script." + raise SSUnsupportedError(err_msg) self._db_scripts.append(db_script) def __eq__(self, other): diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index de6ca8524..4c9b30267 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -116,6 +116,7 @@ def test_db_model(fileutils): smartsim_model.set_path(test_dir) db = exp.create_database(port=6780, interface="lo") + exp.generate(db) model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") @@ -164,9 +165,7 @@ def test_colocated_db_model_error(fileutils): model, inputs, outputs = create_tf_cnn() - colo_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) - with pytest.raises(SSUnsupportedError): - exp.start(colo_model, block=True) + colo_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index b9cc827d1..27685c6b1 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -84,6 +84,7 @@ def test_db_script(fileutils): smartsim_model.set_path(test_dir) db = exp.create_database(port=6780, interface="lo") + exp.generate(db) torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -126,11 +127,7 @@ def test_db_script_error(fileutils): ifname="lo" ) - colo_model.add_function("test_func", function=timestwo, device="CPU") - - # Assert we have added both models - assert(len(colo_model._db_scripts) == 1) - with pytest.raises(SSUnsupportedError): - exp.start(colo_model, block=True) + colo_model.add_function("test_func", function=timestwo, device="CPU") + \ No newline at end of file From 1d3672cec69cda96a3cc139976c6c5276ccaa512 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 19 Apr 2022 18:01:36 +0200 Subject: [PATCH 12/15] Add DBObject functionality to ensembles --- smartsim/_core/control/controller.py | 22 ++- smartsim/_core/utils/redis.py | 3 + smartsim/entity/ensemble.py | 73 ++++---- smartsim/entity/model.py | 28 ++- tests/backends/test_dbmodel.py | 256 +++++++++++++++++++++++--- tests/backends/test_dbscript.py | 265 ++++++++++++++++++++++++--- 6 files changed, 553 insertions(+), 94 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 87150cac7..ab68f372c 100644 --- a/smartsim/_core/control/controller.py +++ 
b/smartsim/_core/control/controller.py @@ -301,8 +301,6 @@ def _launch(self, manifest): steps = [] all_entity_lists = manifest.ensembles + manifest.ray_clusters for elist in all_entity_lists: - elist._add_dbobjects_to_entities() - if elist.batch: batch_step = self._create_batch_job_step(elist) steps.append((batch_step, elist)) @@ -617,13 +615,17 @@ def _set_dbobjects(self, manifest): for ensemble in manifest.ensembles: for db_model in ensemble._db_models: - for entity in ensemble: - set_ml_model(db_model, client) + set_ml_model(db_model, client) for db_script in ensemble._db_scripts: - for entity in ensemble: - set_script(db_script, client) + set_script(db_script, client) for entity in ensemble: - for db_model in entity._db_models: - set_ml_model(db_model, client) - for db_script in entity._db_scripts: - set_script(db_script, client) \ No newline at end of file + if not entity.colocated: + # Set models which could belong only + # to the entities and not to the ensemble + # but avoid duplicates + for db_model in entity._db_models: + if db_model not in ensemble._db_models: + set_ml_model(db_model, client) + for db_script in entity._db_scripts: + if db_script not in ensemble._db_scripts: + set_script(db_script, client) \ No newline at end of file diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index dd7b8b5e1..7b3e4163f 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -116,6 +116,7 @@ def check_cluster_status(hosts, ports, trials=10): # cov-wlm def set_ml_model(db_model: DBModel, client: Client): + logger.debug(f"Adding DBModel named {db_model.name}") devices = db_model._enumerate_devices() for device in devices: @@ -150,6 +151,8 @@ def set_ml_model(db_model: DBModel, client: Client): def set_script(db_script: DBScript, client: Client): + logger.debug(f"Adding DBScript named {db_script.name}") + devices = db_script._enumerate_devices() for device in devices: diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index b7203d35a..ad166a94d 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -189,6 +189,12 @@ def add_model(self, model): raise EntityExistsError( f"Model {model.name} already exists in ensemble {self.name}" ) + + if self._db_models: + self._extend_entity_db_models(model, self._db_models) + if self._db_scripts: + self._extend_entity_db_scripts(model, self._db_scripts) + self.entities.append(model) def register_incoming_entity(self, incoming_entity): @@ -359,6 +365,9 @@ def add_ml_model(self, outputs=outputs ) self._db_models.append(db_model) + for entity in self: + self._extend_entity_db_models(entity, [db_model]) + def add_script(self, name, script=None, script_path=None, device="CPU", devices_per_node=1): """TorchScript to launch with every entity belonging to this ensemble @@ -395,6 +404,8 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ devices_per_node=devices_per_node ) self._db_scripts.append(db_script) + for entity in self: + self._extend_entity_db_scripts(entity, [db_script]) def add_function(self, name, function=None, device="CPU", devices_per_node=1): @@ -430,39 +441,31 @@ def add_function(self, name, function=None, device="CPU", devices_per_node=1): devices_per_node=devices_per_node ) self._db_scripts.append(db_script) + for entity in self: + self._extend_entity_db_scripts(entity, [db_script]) - def _add_dbobjects_to_entities(self): - """Add ensemble DBObjects to each colocated entity - """ - - if self._db_models: - for entity in 
self.entities: - # Colocated entities are responsible for their - # DBModels, as they launch them in the entry point - if entity.colocated: - entity_db_models = [db_model.name for db_model in entity._db_models] - for db_model in self._db_models: - if db_model.is_file: - err_msg = "ML model can not be set from memory for colocated databases.\n" - err_msg += "Please store the ML model in binary format " - err_msg += "and add it to the SmartSim Model as file." - raise SSUnsupportedError(err_msg) - - if not db_model.name in entity_db_models: - entity._db_models.append(db_model) - - if self._db_scripts: - for entity in self.entities: - # Colocated entities are responsible for their - # DBScripts, as they launch them in the entry point - if entity.colocated: - entity_db_scripts = [db_script.name for db_script in entity._db_scripts] - for db_script in self._db_scripts: - if db_script.func: - if not isinstance(db_script.func, str): - err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += "Please convert the function to a string or store it as a text file " - err_msg += "and add it to the SmartSim Model with add_script." - raise SSUnsupportedError(err_msg) - if not db_script.name in entity_db_scripts: - entity._db_scripts.append(db_script) \ No newline at end of file + def _extend_entity_db_models(self, model, db_models): + + entity_db_models = [db_model.name for db_model in model._db_models] + for db_model in db_models: + if not db_model.name in entity_db_models: + if model.colocated and not db_model.is_file: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += f"Please store the ML model named {model.name} in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + + model._db_models.append(db_model) + + def _extend_entity_db_scripts(self, model, db_scripts): + + entity_db_scripts = [db_script.name for db_script in model._db_scripts] + for db_script in db_scripts: + if not db_script.name in entity_db_scripts: + if db_script.func and model.colocated and not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += f"Please convert the function named {db_script.name} to a string " + err_msg += "or store it as a text file and add it to the SmartSim Model with add_script." + raise SSUnsupportedError(err_msg) + + model._db_scripts.append(db_script) \ No newline at end of file diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index b35ca4e89..5625b9399 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -199,6 +199,7 @@ def colocate_db(self, (k,str(v)) for k,v in kwargs.items() if k not in colo_db_config["rai_args"] ]) + self._check_db_objects_colo() colo_db_config["db_models"] = self._db_models colo_db_config["db_scripts"] = self._db_scripts @@ -274,10 +275,10 @@ def add_ml_model(self, inputs=inputs, outputs=outputs ) - + if not db_model.is_file and self.colocated: err_msg = "ML model can not be set from memory for colocated databases.\n" - err_msg += "Please store the ML model in binary format " + err_msg += f"Please store the ML model named {db_model.name} in binary format " err_msg += "and add it to the SmartSim Model as file." 
raise SSUnsupportedError(err_msg) @@ -320,8 +321,8 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ if db_script.func and self.colocated: if not isinstance(db_script.func, str): err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += "Please convert the function to a string or store it as a text file " - err_msg += "and add it to the SmartSim Model with add_script." + err_msg += f"Please convert the function named {db_script.name} to a string or store " + err_msg += "it as a text file and add it to the SmartSim Model with add_script." raise SSUnsupportedError(err_msg) self._db_scripts.append(db_script) @@ -361,8 +362,8 @@ def add_function(self, name, function=None, device="CPU", devices_per_node=1): if db_script.func and self.colocated: if not isinstance(db_script.func, str): err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += "Please convert the function to a string or store it as a text file " - err_msg += "and add it to the SmartSim Model with add_script." + err_msg += f"Please convert the function named {db_script.name} to a string or store " + err_msg += "it as a text file and add it to the SmartSim Model with add_script." raise SSUnsupportedError(err_msg) self._db_scripts.append(db_script) @@ -381,4 +382,19 @@ def __str__(self): # pragma: no cover entity_str += "DB Scripts: \n" + str(len(self._db_scripts)) + "\n" return entity_str + def _check_db_objects_colo(self): + + for db_model in self._db_models: + if not db_model.is_file: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += f"Please store the ML model named {db_model.name} in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + for db_script in self._db_scripts: + if db_script.func: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += f"Please convert the function named {db_script.name} to a string or store it " + err_msg += "as a text file and add it to the SmartSim Model with add_script." 
+ raise SSUnsupportedError(err_msg) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 4c9b30267..6a1870d50 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -2,7 +2,6 @@ import pytest from smartsim import Experiment, status -import smartsim from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError @@ -55,6 +54,97 @@ def create_tf_cnn(): return serialize_model(model) +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_db_model(fileutils): + """Test DB Models on remote DB""" + + exp_name = "test-db-model" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + exp.generate(db) + + model, inputs, outputs = create_tf_cnn() + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + smartsim_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + for db_model in smartsim_model._db_models: + print(db_model) + + # Assert we have added both models + assert(len(smartsim_model._db_models) == 2) + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_db_model_ensemble(fileutils): + """Test DBModels on remote DB, with an ensemble""" + + exp_name = "test-db-model-ensemble" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_ensemble = exp.create_ensemble("smartsim_model", run_settings=run_settings, replicas=2) + smartsim_ensemble.set_path(test_dir) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + exp.generate(db) + + model, inputs, outputs = create_tf_cnn() + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + smartsim_ensemble.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + + for entity in smartsim_ensemble: + entity.disable_key_prefixing() + entity.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + # Ensemble must add all available DBModels to new entity + smartsim_ensemble.add_model(smartsim_model) + smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + # Assert we have added one model to the ensemble + assert(len(smartsim_ensemble._db_models) == 1) + # Assert we have added two models to each entity + assert(all([len(entity._db_models)==2 for entity in smartsim_ensemble])) + + exp.start(db, 
smartsim_ensemble, block=True) + statuses = exp.get_status(smartsim_ensemble) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + @pytest.mark.skipif(not should_run, reason="Test needs TF to run") def test_colocated_db_model(fileutils): """Test DB Models on colocated DB""" @@ -95,49 +185,134 @@ def test_colocated_db_model(fileutils): statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_colocated_db_model_ensemble(fileutils): + """Test DBModel on colocated ensembles, first colocating DB, + then adding DBModel. + """ + + exp_name = "test-colocated-db-model-ensemble" + + # get test setup + test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) + sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_ensemble = exp.create_ensemble("colocated_ens", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=6780, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") + model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + + for i, entity in enumerate(colo_ensemble): + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + # Test that models added individually do not conflict with enemble ones + entity.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + # Test adding a model from ensemble + colo_ensemble.add_ml_model("cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs) + + # Ensemble should add all available DBModels to new model + colo_ensemble.add_model(colo_model) + colo_model.colocate_db( + port=6780+len(colo_ensemble), + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + colo_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + @pytest.mark.skipif(not should_run, reason="Test needs TF to run") -def test_db_model(fileutils): - """Test DB Models on remote DB""" +def test_colocated_db_model_ensemble_reordered(fileutils): + """Test DBModel on colocated ensembles, first adding the DBModel to the + ensemble, then colocating DB. 
+ """ - exp_name = "test-db-model" + exp_name = "test-colocated-db-model-ensemble-reordered" # get test setup test_dir = fileutils.make_test_dir() + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") # create colocated model - run_settings = exp.create_run_settings( + colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=sr_test_script ) - smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) + colo_ensemble = exp.create_ensemble("colocated_ens", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) - db = exp.create_database(port=6780, interface="lo") - exp.generate(db) + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) - model, inputs, outputs = create_tf_cnn() + model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - smartsim_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) - smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + # Test adding a model from ensemble + colo_ensemble.add_ml_model("cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs) - for db_model in smartsim_model._db_models: - print(db_model) + for i, entity in enumerate(colo_ensemble): + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + # Test that models added individually do not conflict with enemble ones + entity.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) - # Assert we have added both models - assert(len(smartsim_model._db_models) == 2) - exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - exp.stop(db) + # Ensemble should add all available DBModels to new model + colo_ensemble.add_model(colo_model) + colo_model.colocate_db( + port=6780+len(colo_ensemble), + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + colo_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.skipif(not should_run or not "tensorflow" in installed_redisai_backends(), reason="Test needs TF to run") -def test_colocated_db_model_error(fileutils): +@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +def test_colocated_db_model_errors(fileutils): """Test error when colocated db model has no file.""" exp_name = "test-colocated-db-model-error" @@ -169,3 +344,40 @@ def test_colocated_db_model_error(fileutils): colo_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + colo_ensemble = exp.create_ensemble("colocated_ens", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) + for i, entity in enumerate(colo_ensemble): + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + with pytest.raises(SSUnsupportedError): + colo_ensemble.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + + # Check errors for reverse order of DBModel 
addition and DB colocation + # create colocated model + colo_settings2 = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + # Reverse order of DBModel and model + colo_ensemble2 = exp.create_ensemble("colocated_ens", run_settings=colo_settings2, replicas=2) + colo_ensemble2.set_path(test_dir) + colo_ensemble2.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) + for i, entity in enumerate(colo_ensemble2): + with pytest.raises(SSUnsupportedError): + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + with pytest.raises(SSUnsupportedError): + colo_ensemble.add_model(colo_model) \ No newline at end of file diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 27685c6b1..5af4e3024 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -1,5 +1,4 @@ import sys -import os.path as osp import pytest from smartsim import Experiment, status @@ -18,6 +17,98 @@ def timestwo(x): return 2*x + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script(fileutils): + """Test DB scripts on remote DB""" + + exp_name = "test-db-script" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + exp.generate(db) + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + smartsim_model.add_script("test_script1", script_path=torch_script, device="CPU") + smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") + smartsim_model.add_function("test_func", function=timestwo, device="CPU") + + # Assert we have all three models + assert(len(smartsim_model._db_scripts) == 3) + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script_ensemble(fileutils): + """Test DB scripts on remote DB""" + + exp_name = "test-db-script" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + ensemble = exp.create_ensemble("dbscript_ensemble", run_settings=run_settings, replicas=2) + ensemble.set_path(test_dir) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=6780, interface="lo") + exp.generate(db) + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + ensemble.add_script("test_script1", script_path=torch_script, device="CPU") + + for entity in ensemble: + entity.disable_key_prefixing() + entity.add_script("test_script2", script=torch_script_str, device="CPU") + + 
ensemble.add_function("test_func", function=timestwo, device="CPU") + + # Ensemble must add all available DBScripts to new entity + ensemble.add_model(smartsim_model) + smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") + + # Assert we have added both models to the ensemble + assert(len(ensemble._db_scripts) == 2) + # Assert we have added all three models to entities in ensemble + assert(all([len(entity._db_scripts) == 3 for entity in ensemble])) + + exp.start(db, ensemble, block=True) + statuses = exp.get_status(ensemble) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") def test_colocated_db_script(fileutils): """Test DB Scripts on colocated DB""" @@ -63,46 +154,134 @@ def test_colocated_db_script(fileutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils): - """Test DB scripts on remote DB""" +def test_colocated_db_script_ensemble(fileutils): + """Test DB Scripts on colocated DB from ensemble, first colocating DB, + then adding script. + """ - exp_name = "test-db-script" + exp_name = "test-colocated-db-script" + exp = Experiment(exp_name, launcher="local") # get test setup test_dir = fileutils.make_test_dir() sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") # create colocated model - run_settings = exp.create_run_settings( + colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=sr_test_script ) - smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) + colo_ensemble = exp.create_ensemble("colocated_ensemble", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) - db = exp.create_database(port=6780, interface="lo") - exp.generate(db) + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + for i, entity in enumerate(colo_ensemble): + entity.disable_key_prefixing() + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + entity.add_script("test_script1", script_path=torch_script, device="CPU") + + colo_model.colocate_db( + port=6780+len(colo_ensemble), + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - smartsim_model.add_script("test_script1", script_path=torch_script, device="CPU") - smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") - smartsim_model.add_function("test_func", function=timestwo, device="CPU") + colo_ensemble.add_script("test_script2", script=torch_script_str, device="CPU") - # Assert we have added both models - assert(len(smartsim_model._db_scripts) == 3) + colo_ensemble.add_model(colo_model) + colo_model.add_script("test_script1", script_path=torch_script, device="CPU") - exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - exp.stop(db) + # Assert we have added one model to the ensemble + assert(len(colo_ensemble._db_scripts) == 1) + # Assert we have added both models to each entity + assert(all([len(entity._db_scripts)==2 for entity in colo_ensemble])) + + + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + 
@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_error(fileutils): - """Test DB Scripts error when setting a function on colocated DB""" +def test_colocated_db_script_ensemble_reordered(fileutils): + """Test DB Scripts on colocated DB from ensemble, first adding the + script to the ensemble, then colocating the DB""" + + exp_name = "test-colocated-db-script" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_ensemble = exp.create_ensemble("colocated_ensemble", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + colo_ensemble.add_script("test_script2", script=torch_script_str, device="CPU") + + for i, entity in enumerate(colo_ensemble): + entity.disable_key_prefixing() + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + entity.add_script("test_script1", script_path=torch_script, device="CPU") + + colo_model.colocate_db( + port=6780+len(colo_ensemble), + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + colo_ensemble.add_model(colo_model) + colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + + # Assert we have added one model to the ensemble + assert(len(colo_ensemble._db_scripts) == 1) + # Assert we have added both models to each entity + assert(all([len(entity._db_scripts)==2 for entity in colo_ensemble])) + + + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run, reason="Test needs Torch to run") +def test_db_script_errors(fileutils): + """Test DB Scripts error when setting a serialized function on colocated DB""" exp_name = "test-colocated-db-script" exp = Experiment(exp_name, launcher="local") @@ -130,4 +309,48 @@ def test_db_script_error(fileutils): with pytest.raises(SSUnsupportedError): colo_model.add_function("test_func", function=timestwo, device="CPU") - \ No newline at end of file + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_ensemble = exp.create_ensemble("colocated_ensemble", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) + + for i, entity in enumerate(colo_ensemble): + entity.colocate_db( + port=6780+i, + db_cpus=1, + limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + with pytest.raises(SSUnsupportedError): + colo_ensemble.add_function("test_func", function=timestwo, device="CPU") + + # create colocated model + colo_settings = exp.create_run_settings( + exe=sys.executable, + exe_args=sr_test_script + ) + + colo_ensemble = exp.create_ensemble("colocated_ensemble", run_settings=colo_settings, replicas=2) + colo_ensemble.set_path(test_dir) + + colo_ensemble.add_function("test_func", function=timestwo, device="CPU") + + for i, entity in enumerate(colo_ensemble): + with pytest.raises(SSUnsupportedError): + entity.colocate_db( + port=6780+i, + db_cpus=1, + 
limit_app_cpus=False, + debug=True, + ifname="lo" + ) + + + with pytest.raises(SSUnsupportedError): + colo_ensemble.add_model(colo_model) \ No newline at end of file From f5de152562b61707f9fda3b295fbf3150b2537e9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 20 Apr 2022 10:21:57 +0200 Subject: [PATCH 13/15] Improve coverage for DBObject code --- smartsim/_core/control/controller.py | 4 ++-- smartsim/_core/launcher/colocated.py | 26 +++++++------------------- smartsim/_core/utils/redis.py | 4 ++-- tests/backends/test_dbmodel.py | 6 +++--- 4 files changed, 14 insertions(+), 26 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index ab68f372c..c2aa92b72 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -291,7 +291,7 @@ def _launch(self, manifest): raise SmartSimError(msg) self._launch_orchestrator(orchestrator) - for rc in manifest.ray_clusters: + for rc in manifest.ray_clusters: # cov-wlm rc._update_workers() if self.orchestrator_active: @@ -602,7 +602,7 @@ def _set_dbobjects(self, manifest): try: client = Client(address=address, cluster=cluster) - except RedisConnectionError as error: + except RedisConnectionError as error: # pragma: no cover logger.error("Could not connect to orchestrator") raise error diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index c3a8a7ec0..0431f3ef0 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -176,16 +176,11 @@ def _build_db_model_cmd(db_models): for db_model in db_models: cmd.append("+db_model") cmd.append(f"--name={db_model.name}") - if db_model.file: - cmd.append(f"--file={db_model.file}") - else: - # This is caught when the DBModel is added through add_ml_model, - # but we keep this check for the sake of safety in case - # DBModels are just copied over from another entity - err_msg = "ML model can not be set from memory for colocated databases.\n" - err_msg += "Please store the ML model in binary format " - err_msg += "and add it to the SmartSim Model as file." - raise SSUnsupportedError(err_msg) + + # Here db_model.file is guaranteed to exist + # because we don't allow the user to pass a serialized DBModel + cmd.append(f"--file={db_model.file}") + cmd.append(f"--backend={db_model.backend}") cmd.append(f"--device={db_model.device}") cmd.append(f"--devices_per_node={db_model.devices_per_node}") @@ -211,15 +206,8 @@ def _build_db_script_cmd(db_scripts): cmd.append("+db_script") cmd.append(f"--name={db_script.name}") if db_script.func: - # This is caught when the DBScript is added through add_script, - # but we keep this check for the sake of safety in case - # DBScripts are just copied over from another entity - if not isinstance(db_script.func, str): - err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += "Please convert the function to a string or store it as a text file " - err_msg += "and add it to the SmartSim Model with add_script." 
- raise SSUnsupportedError(err_msg) - + # Notice that here db_script.func is guaranteed to be a str + # because we don't allow the user to pass a serialized function sanitized_func = db_script.func.replace("\n", "\\n") if not (sanitized_func.startswith("'") and sanitized_func.endswith("'") or (sanitized_func.startswith('"') and sanitized_func.endswith('"'))): diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 7b3e4163f..6ddbcb61d 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -145,7 +145,7 @@ def set_ml_model(db_model: DBModel, client: Client): inputs=db_model.inputs, outputs=db_model.outputs ) - except RedisReplyError as error: + except RedisReplyError as error: # pragma: no cover logger.error("Error while setting model on orchestrator.") raise error @@ -177,6 +177,6 @@ def set_script(db_script: DBScript, client: Client): device=device ) - except RedisReplyError as error: + except RedisReplyError as error: # pragma: no cover logger.error("Error while setting model on orchestrator.") raise error \ No newline at end of file diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 6a1870d50..deb6a2661 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -80,8 +80,8 @@ def test_db_model(fileutils): model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - smartsim_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs) - smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) + smartsim_model.add_ml_model("cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs, tag="test") + smartsim_model.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2, tag="test") for db_model in smartsim_model._db_models: print(db_model) @@ -233,7 +233,7 @@ def test_colocated_db_model_ensemble(fileutils): entity.add_ml_model("cnn2", "TF", model_path=model_file2, device="CPU", inputs=inputs2, outputs=outputs2) # Test adding a model from ensemble - colo_ensemble.add_ml_model("cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs) + colo_ensemble.add_ml_model("cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs, tag="test") # Ensemble should add all available DBModels to new model colo_ensemble.add_model(colo_model) From ec8c9180c4368c4121fcf3ccb5cdeb52152fc132 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 21 Apr 2022 17:43:59 +0200 Subject: [PATCH 14/15] Remove duplicate exception catches, reuse DB check --- smartsim/_core/control/controller.py | 21 ++++++++----- smartsim/_core/control/manifest.py | 32 ++++++++++++++++++++ smartsim/_core/utils/__init__.py | 2 +- smartsim/_core/utils/redis.py | 36 ++++++++++++++++++++++ smartsim/database/orchestrator.py | 23 ++------------ smartsim/entity/ensemble.py | 18 ++--------- smartsim/entity/model.py | 45 +++++++++++++++------------- 7 files changed, 111 insertions(+), 66 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index c2aa92b72..58900f548 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -30,7 +30,7 @@ import threading import time -from ..._core.utils.redis import set_ml_model, set_script +from ..._core.utils.redis import db_is_active, set_ml_model, set_script from ...database import 
Orchestrator from ...entity import DBNode, EntityList, SmartSimEntity from ...error import LauncherError, SmartSimError, SSInternalError, SSUnsupportedError @@ -596,16 +596,21 @@ def reload_saved_db(self, checkpoint_file): def _set_dbobjects(self, manifest): + if not manifest.has_db_objects: + return + db_addresses = self._jobs.get_db_host_addresses() - cluster = len(db_addresses) > 1 - address = db_addresses[0] - try: - client = Client(address=address, cluster=cluster) - except RedisConnectionError as error: # pragma: no cover - logger.error("Could not connect to orchestrator") - raise error + hosts = list(set([address.split(":")[0] for address in db_addresses])) + ports = list(set([address.split(":")[-1] for address in db_addresses])) + + if not db_is_active(hosts=hosts, + ports=ports, + num_shards=len(db_addresses)): + raise SSInternalError("Cannot set DB Objects, DB is not running") + client = Client(address=db_addresses[0], cluster=len(db_addresses) > 1) + for model in manifest.models: if not model.colocated: for db_model in model._db_models: diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 00ad10913..1da253a67 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -194,3 +194,35 @@ def __str__(self): s += "\n" return s + + @property + def has_db_objects(self): + """Check if any entity has DBObjects to set + """ + def has_db_models(entity): + if hasattr(entity, "_db_models"): + return len(entity._db_models) > 0 + def has_db_scripts(entity): + if hasattr(entity, "_db_scripts"): + return len(entity._db_scripts) > 0 + + has_db_objects = False + for model in self.models: + has_db_objects |= hasattr(model, "_db_models") + has_db_objects |= any([has_db_models(model) | has_db_scripts(model) for model in self.models]) + if has_db_objects: + return True + + ensembles = self.ensembles + if not ensembles: + return False + + has_db_objects |= any([has_db_models(ensemble) | has_db_scripts(ensemble) for ensemble in ensembles]) + if has_db_objects: + return True + for ensemble in ensembles: + has_db_objects |= any([has_db_models(model) | has_db_scripts(model) for model in ensemble]) + if has_db_objects: + return True + + return has_db_objects \ No newline at end of file diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index 211f30e6b..05cfb446e 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -1,2 +1,2 @@ from .helpers import colorize, delete_elements, init_default, installed_redisai_backends -from .redis import check_cluster_status, create_cluster +from .redis import check_cluster_status, create_cluster, db_is_active diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 6ddbcb61d..5659ee0e2 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -115,6 +115,42 @@ def check_cluster_status(hosts, ports, trials=10): # cov-wlm raise SSInternalError("Cluster setup could not be verified") +def db_is_active(hosts, ports, num_shards): + """Check if a DB is running + + if the DB is clustered, check cluster status, otherwise + just ping DB. 
+ + :param hosts: list of hosts + :type hosts: list[str] + :param ports: list of ports + :type ports: list[int] + :param num_shards: Number of DB shards + :type num_shards: int + :return: Whether DB is running + :rtype: bool + """ + # if single shard + if num_shards < 2: + host = hosts[0] + port = ports[0] + try: + client = redis.Redis(host=host, port=port, db=0) + if client.ping(): + return True + return False + except redis.RedisError: + return False + # if a cluster + else: + try: + check_cluster_status(hosts, ports, trials=1) + return True + # we expect this to fail if the cluster is not active + except SSInternalError: + return False + + def set_ml_model(db_model: DBModel, client: Client): logger.debug(f"Adding DBModel named {db_model.name}") devices = db_model._enumerate_devices() diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 57bdd20ef..f45329b07 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -34,7 +34,7 @@ from smartredis import Client from smartredis.error import RedisReplyError -from .._core.utils import check_cluster_status +from .._core.utils import db_is_active from .._core.config import CONFIG from .._core.utils.helpers import is_valid_cmd from .._core.utils.network import get_ip_from_host @@ -261,25 +261,8 @@ def is_active(self): if not self._hosts: return False - # if single shard - if self.num_shards < 2: - host = self._hosts[0] - port = self.ports[0] - try: - client = redis.Redis(host=host, port=port, db=0) - if client.ping(): - return True - return False - except redis.RedisError: - return False - # if a cluster - else: - try: - check_cluster_status(self._hosts, self.ports, trials=1) - return True - # we expect this to fail if the cluster is not active - except SSInternalError: - return False + return db_is_active(self._hosts, self.ports, self.num_shards) + @property def _rai_module(self): diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index ad166a94d..9f1794327 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -445,27 +445,13 @@ def add_function(self, name, function=None, device="CPU", devices_per_node=1): self._extend_entity_db_scripts(entity, [db_script]) def _extend_entity_db_models(self, model, db_models): - entity_db_models = [db_model.name for db_model in model._db_models] for db_model in db_models: if not db_model.name in entity_db_models: - if model.colocated and not db_model.is_file: - err_msg = "ML model can not be set from memory for colocated databases.\n" - err_msg += f"Please store the ML model named {model.name} in binary format " - err_msg += "and add it to the SmartSim Model as file." - raise SSUnsupportedError(err_msg) - - model._db_models.append(db_model) + model._append_db_model(db_model) def _extend_entity_db_scripts(self, model, db_scripts): - entity_db_scripts = [db_script.name for db_script in model._db_scripts] for db_script in db_scripts: if not db_script.name in entity_db_scripts: - if db_script.func and model.colocated and not isinstance(db_script.func, str): - err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += f"Please convert the function named {db_script.name} to a string " - err_msg += "or store it as a text file and add it to the SmartSim Model with add_script." 
- raise SSUnsupportedError(err_msg) - - model._db_scripts.append(db_script) \ No newline at end of file + model._append_db_script(db_script) \ No newline at end of file diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 5625b9399..887bc8f04 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -275,14 +275,9 @@ def add_ml_model(self, inputs=inputs, outputs=outputs ) + self._append_db_model(db_model) + - if not db_model.is_file and self.colocated: - err_msg = "ML model can not be set from memory for colocated databases.\n" - err_msg += f"Please store the ML model named {db_model.name} in binary format " - err_msg += "and add it to the SmartSim Model as file." - raise SSUnsupportedError(err_msg) - - self._db_models.append(db_model) def add_script(self, name, script=None, script_path=None, device="CPU", devices_per_node=1): """TorchScript to launch with this Model instance @@ -318,13 +313,8 @@ def add_script(self, name, script=None, script_path=None, device="CPU", devices_ device=device, devices_per_node=devices_per_node ) - if db_script.func and self.colocated: - if not isinstance(db_script.func, str): - err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += f"Please convert the function named {db_script.name} to a string or store " - err_msg += "it as a text file and add it to the SmartSim Model with add_script." - raise SSUnsupportedError(err_msg) - self._db_scripts.append(db_script) + self._append_db_script(db_script) + def add_function(self, name, function=None, device="CPU", devices_per_node=1): @@ -359,13 +349,7 @@ def add_function(self, name, function=None, device="CPU", devices_per_node=1): device=device, devices_per_node=devices_per_node ) - if db_script.func and self.colocated: - if not isinstance(db_script.func, str): - err_msg = "Functions can not be set from memory for colocated databases.\n" - err_msg += f"Please convert the function named {db_script.name} to a string or store " - err_msg += "it as a text file and add it to the SmartSim Model with add_script." - raise SSUnsupportedError(err_msg) - self._db_scripts.append(db_script) + self._append_db_script(db_script) def __eq__(self, other): if self.name == other.name: @@ -382,6 +366,25 @@ def __str__(self): # pragma: no cover entity_str += "DB Scripts: \n" + str(len(self._db_scripts)) + "\n" return entity_str + + def _append_db_model(self, db_model): + if not db_model.is_file and self.colocated: + err_msg = "ML model can not be set from memory for colocated databases.\n" + err_msg += f"Please store the ML model named {db_model.name} in binary format " + err_msg += "and add it to the SmartSim Model as file." + raise SSUnsupportedError(err_msg) + + self._db_models.append(db_model) + + def _append_db_script(self, db_script): + if db_script.func and self.colocated: + if not isinstance(db_script.func, str): + err_msg = "Functions can not be set from memory for colocated databases.\n" + err_msg += f"Please convert the function named {db_script.name} to a string or store " + err_msg += "it as a text file and add it to the SmartSim Model with add_script." 
+ raise SSUnsupportedError(err_msg) + self._db_scripts.append(db_script) + def _check_db_objects_colo(self): for db_model in self._db_models: From 5cf503c4086b300a5f80ed080cf803d4c268cc9b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 25 Apr 2022 15:39:14 +0200 Subject: [PATCH 15/15] Minor adjustments following review --- smartsim/_core/control/manifest.py | 14 +++++++++++++- smartsim/_core/entrypoints/colocated.py | 4 ++-- smartsim/entity/model.py | 4 ++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 1da253a67..652f6f625 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -199,6 +199,7 @@ def __str__(self): def has_db_objects(self): """Check if any entity has DBObjects to set """ + def has_db_models(entity): if hasattr(entity, "_db_models"): return len(entity._db_models) > 0 @@ -206,23 +207,34 @@ def has_db_scripts(entity): if hasattr(entity, "_db_scripts"): return len(entity._db_scripts) > 0 + has_db_objects = False for model in self.models: has_db_objects |= hasattr(model, "_db_models") + + # Check if any model has either a DBModel or a DBScript + # we update has_db_objects so that as soon as one check + # returns True, we can exit has_db_objects |= any([has_db_models(model) | has_db_scripts(model) for model in self.models]) if has_db_objects: return True + # If there are no ensembles, there can be no outstanding model + # to check for DBObjects, return current value of DBObjects, which + # should be False ensembles = self.ensembles if not ensembles: - return False + return has_db_objects + # First check if there is any ensemble DBObject, if so, return True has_db_objects |= any([has_db_models(ensemble) | has_db_scripts(ensemble) for ensemble in ensembles]) if has_db_objects: return True for ensemble in ensembles: + # Last case, check if any model within an ensemble has DBObjects attached has_db_objects |= any([has_db_models(model) | has_db_scripts(model) for model in ensemble]) if has_db_objects: return True + # `has_db_objects` should be False here return has_db_objects \ No newline at end of file diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 484050ce3..1cd12f7f9 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -38,7 +38,7 @@ from subprocess import PIPE, STDOUT from smartredis import Client -from smartredis.error import RedisConnectionError +from smartredis.error import RedisConnectionError, RedisReplyError from smartsim._core.utils.network import current_ip from smartsim.error import SSInternalError from smartsim.log import get_logger @@ -216,7 +216,7 @@ def main(network_interface: str, db_cpus: int, command: List[str], db_models: Li logger.debug(f"Added script {script_name} ({i+1}/{len(db_scripts)})") # Make sure we don't keep this around del client - except RedisConnectionError: + except (RedisConnectionError, RedisReplyError): raise SSInternalError("Failed to set model or script, could not connect to database") diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 887bc8f04..c4a969cf6 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -244,7 +244,7 @@ def add_ml_model(self, :param name: key to store model under :type name: str :param model: model in memory - :type model: str, optional # TODO figure out what to type hint this as + :type model: byte string, optional :param model_path: serialized 
model :type model_path: file path to model :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) @@ -335,7 +335,7 @@ def add_function(self, name, function=None, device="CPU", devices_per_node=1): :param name: key to store function under :type name: str :param script: TorchScript code - :type script: str, optional + :type script: str or byte string, optional :param script_path: path to TorchScript code :type script_path: str, optional :param device: device for script execution, defaults to "CPU"
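
For context, the docstring changes above distinguish an in-memory model (the model argument, a byte string) from an on-disk model (model_path). The following is a minimal illustrative sketch, not taken from this patch series, that combines the calls documented in it; the tensor names, file paths, and run settings are placeholders, and the colocated case must use a file, as enforced by _append_db_model / _check_db_objects_colo:

from smartsim import Experiment

exp = Experiment("db-model-example", launcher="local")
rs = exp.create_run_settings(exe="python", exe_args="app.py")

# A remote (non-colocated) database accepts an in-memory serialized model...
serialized_model = open("model.pb", "rb").read()  # placeholder model file
remote_model = exp.create_model("example_model", rs)
remote_model.add_ml_model("cnn", "TF", model=serialized_model,
                          device="CPU", inputs=["args_0"], outputs=["Identity"])

# ...while a colocated database only accepts a serialized model file (model_path);
# passing in-memory bytes to a colocated Model raises SSUnsupportedError.
colo_model = exp.create_model("example_colo_model", rs)
colo_model.colocate_db(port=6780, db_cpus=1, ifname="lo")
colo_model.add_ml_model("cnn_file", "TF", model_path="model.pb",
                        device="CPU", inputs=["args_0"], outputs=["Identity"])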