From 02b9069ea436b89af101af1c81c8eb56eb99b4dd Mon Sep 17 00:00:00 2001
From: Matthew Ellis
Date: Wed, 14 Jun 2023 09:30:25 -0700
Subject: [PATCH 01/25] Fix missing Model functions in the documentation summary.

---
 doc/api/smartsim_api.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst
index 4b953f89a..f6e876867 100644
--- a/doc/api/smartsim_api.rst
+++ b/doc/api/smartsim_api.rst
@@ -431,6 +431,10 @@ Model
    Model.colocate_db
    Model.colocate_db_tcp
    Model.colocate_db_uds
+   Model.colocated
+   Model.add_ml_model
+   Model.add_script
+   Model.add_function
    Model.params_to_args
    Model.register_incoming_entity
    Model.enable_key_prefixing

From 9784109e17bd278071197f426ad8a214c1ff4360 Mon Sep 17 00:00:00 2001
From: Matthew Ellis
Date: Wed, 14 Jun 2023 09:34:17 -0700
Subject: [PATCH 02/25] Update changelog.

---
 doc/changelog.rst | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 77054b42c..d432722c6 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -25,6 +25,7 @@ Description

 A full list of changes and detailed notes can be found below:

+- Fix add_ml_model() and add_script() documentation, tests, and code
 - Remove wait time associated with Experiment launch summary
 - Update and rename Redis conf file
 - Migrate from redis-py-cluster to redis-py
@@ -38,8 +39,11 @@ A full list of changes and detailed notes can be found below:

 Detailed notes

-- Fix defect where dictionaries used to create run settings can be changed
+- Fix defect where dictionaries used to create run settings can be changed
   unexpectedly due to copy-by-ref (PR305_)
+- The underlying code for Model.add_ml_model() and Model.add_script() was fixed
+  to correctly handle multi-GPU configurations. Tests were updated to run on
+  non-local launchers. Documentation was updated and fixed. (PR304_)
 - Typehints have been added. A makefile target `make check-mypy` executes static
   analysis with mypy. (PR295_, PR303_)
 - Simplify code in `random_permutations` parameter generation strategy (PR300_)
 - Remove wait time associated with Experiment launch summary (PR298_)
 - Update Redis conf file to conform with Redis standard (PR293_)
 - Migrate from redis-py-cluster to redis-py for cluster status checks (PR292_)
 - Update full test suite to no longer require a tensorflow wheel to be
   available at test time. (PR291_)
 - Correct spelling of colocated in doc strings (PR290_)
-- Deprecated launcher-specific orchestrators, constants, and ML
+- Deprecated launcher-specific orchestrators, constants, and ML
   utilities were removed. (PR289_)
 - Relax the coloredlogs version to be greater than 10.0 (PR288_)
 - Update the Github Actions runner image from ``macos-10.15`` to ``macos-12``.
   The former began deprecation in May 2022 and was finally removed in May 2023.
   (PR285_)
-- The Fortran tutorials had not been fully updated to show how to handle
+- The Fortran tutorials had not been fully updated to show how to handle
   return/error codes. These have now all been updated. (PR284_)
-- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The
+- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The
   argument name is still `interface` for backward compatibility reasons. (PR281_)

.. _PR305: https://github.com/CrayLabs/SmartSim/pull/305
+.. _PR304: https://github.com/CrayLabs/SmartSim/pull/304
.. _PR303: https://github.com/CrayLabs/SmartSim/pull/303
.. _PR300: https://github.com/CrayLabs/SmartSim/pull/300
..
_PR298: https://github.com/CrayLabs/SmartSim/pull/298 From 39241e294d7785e27af17ad2ef68b100d42f37dc Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Wed, 14 Jun 2023 15:10:31 -0700 Subject: [PATCH 03/25] Fix the functionality of setting models and scripts on multiple GPUs. --- smartsim/_core/entrypoints/colocated.py | 48 ++++++++++++------------- smartsim/entity/model.py | 18 +++++++--- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index e03449a3f..29b63cce4 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -92,7 +92,22 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: if args.outputs: outputs = list(args.outputs) - if args.devices_per_node == 1: + # devices_per_node being greater than one only applies + # to GPU devices + if args.devices_per_node > 1 and args.device.lower() == "gpu": + client.set_model_from_file_multigpu( + args.name, + args.file, + args.backend, + 0, + args.devices_per_node, + args.batch_size, + args.min_batch_size, + args.tag, + inputs, + outputs + ) + else: client.set_model_from_file( args.name, args.file, @@ -102,21 +117,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: args.min_batch_size, args.tag, inputs, - outputs, + outputs ) - else: - for device_num in range(args.devices_per_node): - client.set_model_from_file( - args.name, - args.file, - args.backend, - args.device + f":{device_num}", - args.batch_size, - args.min_batch_size, - args.tag, - inputs, - outputs, - ) return args.name @@ -142,19 +144,15 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: if args.func: func = args.func.replace("\\n", "\n") - if args.devices_per_node == 1: - client.set_script(args.name, func, args.device) + if args.devices_per_node > 1 and args.device.lower() == "gpu": + client.set_script_multigpu(args.name, func, 0, args.devices_per_node) else: - for device_num in range(args.devices_per_node): - client.set_script(args.name, func, args.device + f":{device_num}") + client.set_script(args.name, func, args.device) elif args.file: - if args.devices_per_node == 1: - client.set_script_from_file(args.name, args.file, args.device) + if args.devices_per_node > 1 and args.device.lower() == "gpu": + client.set_script_from_file_multigpu(args.name, args.file, 0, args.devices_per_node) else: - for device_num in range(args.devices_per_node): - client.set_script_from_file( - args.name, args.file, args.device + f":{device_num}" - ) + client.set_script_from_file(args.name, args.file, args.device) return args.name diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 3729aa720..c17f6834d 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -75,7 +75,7 @@ def __init__( self._db_models: t.List[DBModel] = [] self._db_scripts: t.List[DBScript] = [] self.files: t.Optional[EntityFiles] = None - + @property def colocated(self) -> bool: """Return True if this Model will run with a colocated Orchestrator""" @@ -356,14 +356,18 @@ def add_ml_model( :param name: key to store model under :type name: str + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str :param model: model in memory :type model: byte string, optional :param model_path: serialized model :type model_path: file path to model - :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str :param device: name of device for execution, defaults to "CPU" 
:type device: str, optional
+    :param devices_per_node: The number of GPU devices available on the host.
+        This parameter only applies to GPU devices and will be ignored if device
+        is specified as CPU.
+    :type devices_per_node: int
     :param batch_size: batch size for execution, defaults to 0
     :type batch_size: int, optional
     :param min_batch_size: minimum batch size for model execution, defaults to 0
@@ -421,7 +425,9 @@ def add_script(
     :type script_path: str, optional
     :param device: device for script execution, defaults to "CPU"
     :type device: str, optional
-    :param devices_per_node: number of devices on each host
+    :param devices_per_node: The number of GPU devices available on the host.
+        This parameter only applies to GPU devices and will be ignored if device
+        is specified as CPU.
     :type devices_per_node: int
     """
     db_script = DBScript(
@@ -462,7 +468,9 @@ def add_function(
     :type script_path: str, optional
     :param device: device for script execution, defaults to "CPU"
     :type device: str, optional
-    :param devices_per_node: number of devices on each host
+    :param devices_per_node: The number of GPU devices available on the host.
+        This parameter only applies to GPU devices and will be ignored if device
+        is specified as CPU.
     :type devices_per_node: int
     """
     db_script = DBScript(

From 1480fe26749374498aa31b5ce9ce9adfcc7bdb7a Mon Sep 17 00:00:00 2001
From: Matthew Ellis
Date: Thu, 15 Jun 2023 09:29:50 -0700
Subject: [PATCH 04/25] Update multi-db code and improve colocated db tests.

---
 conftest.py                     |  6 +++
 smartsim/_core/config/config.py |  8 +++
 tests/backends/test_dbmodel.py  | 96 ++++++++++++++++++++++++---------
 tests/backends/test_tf.py       |  2 +-
 4 files changed, 87 insertions(+), 25 deletions(-)

diff --git a/conftest.py b/conftest.py
index 823e7fc53..25a8a76a4 100644
--- a/conftest.py
+++ b/conftest.py
@@ -51,6 +51,7 @@
 test_dir = os.path.join(test_path, "tests", "test_output")
 test_launcher = CONFIG.test_launcher
 test_device = CONFIG.test_device
+test_num_gpus = CONFIG.test_num_gpus
 test_nic = CONFIG.test_interface
 test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None)
 test_port = CONFIG.test_port
@@ -591,3 +592,8 @@ class MLUtils:
     def get_test_device():
         global test_device
         return test_device
+
+    @staticmethod
+    def get_test_num_gpus():
+        global test_num_gpus
+        return test_num_gpus

diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py
index 832958f13..42efb7441 100644
--- a/smartsim/_core/config/config.py
+++ b/smartsim/_core/config/config.py
@@ -79,6 +79,10 @@
 # - CPU or GPU for model serving tests
 # - Default: CPU
 #
+# SMARTSIM_TEST_NUM_GPUS
+# - Number of GPUs on the host for testing
+# - Default: 1
+#
 # SMARTSIM_TEST_ACCOUNT
 # - Account used to run full launcher test suite on external systems
 # - Default: None
@@ -158,6 +162,10 @@ def test_launcher(self) -> str:  # pragma: no cover
     def test_device(self) -> str:  # pragma: no cover
         return os.environ.get("SMARTSIM_TEST_DEVICE", "CPU")

+    @property
+    def test_num_gpus(self) -> int:  # pragma: no cover
+        return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS", 1))
+
     @property
     def test_port(self) -> int:  # pragma: no cover
         return int(os.environ.get("SMARTSIM_TEST_PORT", 6780))

diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py
index b0e393db8..15e4ded98 100644
--- a/tests/backends/test_dbmodel.py
+++ b/tests/backends/test_dbmodel.py
@@ -131,7 +131,7 @@ def save_torch_cnn(path, file_name):

 @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
-def test_tf_db_model(fileutils,
wlmutils): +def test_tf_db_model(fileutils, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" exp_name = "test-tf-db-model" @@ -153,11 +153,15 @@ def test_tf_db_model(fileutils, wlmutils): model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + smartsim_model.add_ml_model( "cnn", "TF", model=model, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs, tag="test", @@ -166,7 +170,8 @@ def test_tf_db_model(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, tag="test", @@ -185,7 +190,7 @@ def test_tf_db_model(fileutils, wlmutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, wlmutils): +def test_pt_db_model(fileutils, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" exp_name = "test-pt-db-model" @@ -207,11 +212,15 @@ def test_pt_db_model(fileutils, wlmutils): save_torch_cnn(test_dir, "model1.pt") model_path = test_dir + "/model1.pt" + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + smartsim_model.add_ml_model( "cnn", "TORCH", model_path=model_path, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, tag="test", ) @@ -228,7 +237,7 @@ def test_pt_db_model(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble(fileutils, wlmutils): +def test_db_model_ensemble(fileutils, wlmutils, mlutils): """Test DBModels on remote DB, with an ensemble""" exp_name = "test-db-model-ensemble" @@ -259,13 +268,17 @@ def test_db_model_ensemble(fileutils, wlmutils): "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs ) + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + for entity in smartsim_ensemble: entity.disable_key_prefixing() entity.add_ml_model( "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -276,7 +289,8 @@ def test_db_model_ensemble(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -293,7 +307,7 @@ def test_db_model_ensemble(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_tf(fileutils, wlmutils): +def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (TensorFlow backend)""" exp_name = "test-colocated-db-model-tf" @@ -319,14 +333,24 @@ def test_colocated_db_model_tf(fileutils, wlmutils): model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + colo_model.add_ml_model( - "cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs + "cnn", + "TF", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus, + inputs=inputs, + outputs=outputs ) colo_model.add_ml_model( "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, 
outputs=outputs2, ) @@ -352,7 +376,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_colocated_db_model_pytorch(fileutils, wlmutils): +def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (PyTorch backend)""" exp_name = "test-colocated-db-model-pytorch" @@ -375,9 +399,16 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils): ifname="lo", ) + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + save_torch_cnn(test_dir, "model1.pt") model_file = test_dir + "/model1.pt" - colo_model.add_ml_model("cnn", "TORCH", model_path=model_file, device="CPU") + colo_model.add_ml_model("cnn", + "TORCH", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus) # Assert we have added both models assert len(colo_model._db_models) == 1 @@ -388,7 +419,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble(fileutils, wlmutils): +def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): """Test DBModel on colocated ensembles, first colocating DB, then adding DBModel. """ @@ -421,6 +452,9 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + for i, entity in enumerate(colo_ensemble): entity.colocate_db( port=wlmutils.get_test_port() + i, @@ -434,7 +468,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -444,7 +479,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): "cnn", "TF", model_path=model_file, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs, tag="test", @@ -463,7 +499,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -474,7 +511,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): +def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): """Test DBModel on colocated ensembles, first adding the DBModel to the ensemble, then colocating DB. 
""" @@ -500,6 +537,9 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + # Test adding a model from ensemble colo_ensemble.add_ml_model( "cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs @@ -518,7 +558,8 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -536,7 +577,8 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -547,7 +589,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, wlmutils): +def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): """Test error when colocated db model has no file.""" exp_name = "test-colocated-db-model-error" @@ -572,9 +614,13 @@ def test_colocated_db_model_errors(fileutils, wlmutils): model, inputs, outputs = create_tf_cnn() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + with pytest.raises(SSUnsupportedError): colo_model.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", "TF", model=model, device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) colo_ensemble = exp.create_ensemble( @@ -592,7 +638,8 @@ def test_colocated_db_model_errors(fileutils, wlmutils): with pytest.raises(SSUnsupportedError): colo_ensemble.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", "TF", model=model, device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) # Check errors for reverse order of DBModel addition and DB colocation @@ -607,7 +654,8 @@ def test_colocated_db_model_errors(fileutils, wlmutils): ) colo_ensemble2.set_path(test_dir) colo_ensemble2.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", "TF", model=model, device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index 2bf327b16..e30ad4f24 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -54,7 +54,7 @@ def test_keras_model(fileutils, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU - Similarly, the test can excute on any launcher by setting SMARTSIM_TEST_LAUNCHER + Similarly, the test can execute on any launcher by setting SMARTSIM_TEST_LAUNCHER which is local by default. You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU From 186cb10ce4b98a8027bcf9fe57611cbe02affeed Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Thu, 15 Jun 2023 23:18:55 -0700 Subject: [PATCH 05/25] Fix DBModel and DBScript tests to be able to run on WLM tests with GPUs. 
--- smartsim/entity/model.py | 4 +- tests/backends/test_dbmodel.py | 397 ++++++++++++++++++++----------- tests/backends/test_dbscript.py | 403 ++++++++++++++++++++++++-------- 3 files changed, 563 insertions(+), 241 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index c17f6834d..d191aa6d3 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -358,7 +358,7 @@ def add_ml_model( :type name: str :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) :type backend: str - :param model: model in memory + :param model: A model in memory (only supported for non-colocated orchestrators) :type model: byte string, optional :param model_path: serialized model :type model_path: file path to model @@ -419,7 +419,7 @@ def add_script( :param name: key to store script under :type name: str - :param script: TorchScript code + :param script: TorchScript code (only supported for non-colocated orchestrators) :type script: str, optional :param script_path: path to TorchScript code :type script_path: str, optional diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 15e4ded98..54fa65a43 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -34,6 +34,9 @@ from smartsim import Experiment, status from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError +from smartsim.log import get_logger + +logger = get_logger(__name__) should_run_tf = True should_run_pt = True @@ -134,28 +137,37 @@ def save_torch_cnn(path, file_name): def test_tf_db_model(fileutils, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" + # Set experiment name exp_name = "test-tf-db-model" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add ML model to the SmartSim model smartsim_model.add_ml_model( "cnn", "TF", @@ -177,44 +189,57 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): tag="test", ) + logger.debug("The following ML models have been added:") for db_model in smartsim_model._db_models: - print(db_model) + logger.debug(db_model) # Assert we have added both models assert len(smartsim_model._db_models) == 2 - 
exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") def test_pt_db_model(fileutils, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" + # Set experiment name exp_name = "test-pt-db-model" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") model_path = test_dir + "/model1.pt" - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add ML model to the SmartSim model smartsim_model.add_ml_model( "cnn", "TORCH", @@ -224,53 +249,75 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): tag="test", ) + logger.debug("The following ML models have been added:") for db_model in smartsim_model._db_models: - print(db_model) + logger.debug(db_model) + # Assert we have added both models assert len(smartsim_model._db_models) == 1 - exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_db_model_ensemble(fileutils, wlmutils, mlutils): """Test DBModels on remote DB, with an ensemble""" + # Set experiment name exp_name = "test-db-model-ensemble" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + # Create the 
SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create ensemble smartsim_ensemble = exp.create_ensemble( "smartsim_model", run_settings=run_settings, replicas=2 ) smartsim_ensemble.set_path(test_dir) + # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + # Add the first ML model to all of the ensemble members smartsim_ensemble.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + inputs=inputs, + outputs=outputs ) - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add the second ML model individually to each SmartSim model for entity in smartsim_ensemble: entity.disable_key_prefixing() entity.add_ml_model( @@ -283,8 +330,11 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): outputs=outputs2, ) - # Ensemble must add all available DBModels to new entity + # Add new ensemble member smartsim_ensemble.add_model(smartsim_model) + + # Add the second ML model to the newly added entity. This is + # because the test script run both ML models for all entities. 
smartsim_model.add_ml_model( "cnn2", "TF", @@ -300,42 +350,53 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): # Assert we have added two models to each entity assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) - exp.start(db, smartsim_ensemble, block=True) - statuses = exp.get_status(smartsim_ensemble) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_ensemble, block=True) + statuses = exp.get_status(smartsim_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (TensorFlow backend)""" + # Set experiment name exp_name = "test-colocated-db-model-tf" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experience + exp = Experiment(exp_name, launcher=test_launcher) + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Create and save ML model to filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add ML models to the application colo_model.add_ml_model( "cnn", "TF", @@ -358,52 +419,53 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_models) == 2 - exp.start(colo_model, block=False) - - completed = False - timeout = 90 - check_interval = 5 - while timeout and not completed: - timeout -= check_interval - time.sleep(check_interval) + # Launch and check successful completion + try: + exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - if all([stat == status.STATUS_COMPLETED for stat in statuses]): - completed = True - - if not completed: + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: exp.stop(colo_model) - assert False - @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (PyTorch backend)""" + # Set experiment name exp_name = "test-colocated-db-model-pytorch" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + 
test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + + # Create the SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create colocated RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") model_file = test_dir + "/model1.pt" + + # Add the ML model to the SmartSim Model colo_model.add_ml_model("cnn", "TORCH", model_path=model_file, @@ -413,9 +475,13 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_models) == 1 - exp.start(colo_model, block=True) - statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_model) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @@ -424,46 +490,57 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): then adding DBModel. 
""" + # Set experiment name exp_name = "test-colocated-db-model-ensemble" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # Create RunSettings for colocated model + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create ensemble of two identical models colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Create and save the ML models to the filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) - # Test that models added individually do not conflict with enemble ones + # Add ML model to each ensemble member individual to test that they + # do not conflict with models add to the Ensemble object entity.add_ml_model( "cnn2", "TF", @@ -474,7 +551,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): outputs=outputs2, ) - # Test adding a model from ensemble + # Test adding a model from Ensemble object colo_ensemble.add_ml_model( "cnn", "TF", @@ -486,15 +563,10 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): tag="test", ) - # Ensemble should add all available DBModels to new model + # Add a new model to the ensemble colo_ensemble.add_model(colo_model) - colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), - db_cpus=1, - limit_app_cpus=False, - debug=True, - ifname="lo", - ) + + # Add the ML model to SmartSim Model just added to the ensemble colo_model.add_ml_model( "cnn2", "TF", @@ -505,9 +577,13 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): outputs=outputs2, ) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to 
run") @@ -516,44 +592,61 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): ensemble, then colocating DB. """ + # Set experiment name exp_name = "test-colocated-db-model-ensemble-reordered" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + # Create the SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create colocated RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create the ensemble of two identical SmartSim Model colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) + # Create and save ML model to filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - # Test adding a model from ensemble colo_ensemble.add_ml_model( - "cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs + "cnn", + "TF", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus, + inputs=inputs, + outputs=outputs ) + # Colocate a database with the first ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db( - wlmutils.get_test_port() + i, + port = test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) - # Test that models added individually do not conflict with enemble ones + # Add ML models to each ensemble member to make sure they + # do not conflict with other ML models entity.add_ml_model( "cnn2", "TF", @@ -564,15 +657,18 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): outputs=outputs2, ) - # Ensemble should add all available DBModels to new model + # Add another ensemble member colo_ensemble.add_model(colo_model) + + # Colocate a database with the new ensemble member colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), + port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Add a ML model to the new ensemble member colo_model.add_ml_model( "cnn2", "TF", @@ -583,69 +679,90 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): outputs=outputs2, ) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) 
@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): """Test error when colocated db model has no file.""" + # Set experiment name exp_name = "test-colocated-db-model-error" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + # Create colocated RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Get and save TF model model, inputs, outputs = create_tf_cnn() - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Check that an error is raised because in-memory models + # are only supported for non-colocated deployments with pytest.raises(SSUnsupportedError): colo_model.add_ml_model( "cnn", "TF", model=model, device=test_device, devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) + # Create an ensemble with two identical replicas colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + + # Colocate a db with each ensemble member for i, entity in enumerate(colo_ensemble): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Check that an error is raised because in-memory models + # are only supported for non-colocated deployments with pytest.raises(SSUnsupportedError): colo_ensemble.add_ml_model( "cnn", "TF", model=model, device=test_device, devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) - # Check errors for reverse order of DBModel addition and DB colocation - # create colocated model + # Check error is still thrown if an in-memory model is used + # with a colocated deployment. This test varies by adding + # the SmartSIm model with a colocated database to the ensemble + # after the ML model was been added to the ensemble. 
colo_settings2 = exp.create_run_settings( - exe=sys.executable, exe_args=sr_test_script + exe=sys.executable, exe_args=test_script ) # Reverse order of DBModel and model @@ -660,11 +777,11 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) with pytest.raises(SSUnsupportedError): diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index b56f85a8f..a260fcf2b 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -31,6 +31,9 @@ from smartsim import Experiment, status from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError +from smartsim.log import get_logger + +logger = get_logger(__name__) should_run = True @@ -47,321 +50,523 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, wlmutils): +def test_db_script(fileutils, wlmutils, mlutils): """Test DB scripts on remote DB""" + # Set experiment name exp_name = "test-db-script" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create the RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create the SmartSim database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Define the torch script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - smartsim_model.add_script("test_script1", script_path=torch_script, device="CPU") - smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") - smartsim_model.add_function("test_func", function=timestwo, device="CPU") + # Add the script via file + smartsim_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + + # Add script via string + smartsim_model.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) - # Assert we have all three models + # Add script function + smartsim_model.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + + # Assert we have all three scripts assert len(smartsim_model._db_scripts) == 3 - exp.start(db, smartsim_model, block=True) - statuses = 
exp.get_status(smartsim_model) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, wlmutils): +def test_db_script_ensemble(fileutils, wlmutils, mlutils): """Test DB scripts on remote DB""" + # Set experiment name exp_name = "test-db-script" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create Ensemble with two identical models ensemble = exp.create_ensemble( "dbscript_ensemble", run_settings=run_settings, replicas=2 ) ensemble.set_path(test_dir) + # Create SmartSim model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create SmartSim database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create the script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - ensemble.add_script("test_script1", script_path=torch_script, device="CPU") + # Add script via file for the Ensemble object + ensemble.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add script via string for each ensemble entity for entity in ensemble: entity.disable_key_prefixing() - entity.add_script("test_script2", script=torch_script_str, device="CPU") + entity.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) - ensemble.add_function("test_func", function=timestwo, device="CPU") + # Add script via function + ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) - # Ensemble must add all available DBScripts to new entity + # Add an additional ensemble member and attach a script to the new member ensemble.add_model(smartsim_model) - smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") + smartsim_model.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added both models to the ensemble assert len(ensemble._db_scripts) == 2 + # Assert we have added all three models to entities in ensemble assert all([len(entity._db_scripts) == 3 for entity in ensemble]) - exp.start(db, ensemble, block=True) - 
statuses = exp.get_status(ensemble) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + try: + exp.start(db, ensemble, block=True) + statuses = exp.get_status(ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, wlmutils): +def test_colocated_db_script(fileutils, wlmutils, mlutils): """Test DB Scripts on colocated DB""" + # Set the experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Create string for script creation torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - colo_model.add_script("test_script1", script_path=torch_script, device="CPU") - colo_model.add_script("test_script2", script=torch_script_str, device="CPU") + # Add script via file + colo_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add script via string + colo_model.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added both models assert len(colo_model._db_scripts) == 2 for db_script in colo_model._db_scripts: - print(db_script) + logger.debug(db_script) - exp.start(colo_model, block=True) - statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + try: + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_model) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble(fileutils, wlmutils): +def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): """Test DB Scripts on colocated DB from ensemble, first colocating DB, then adding script. 
""" + # Set experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create SmartSim Ensemble with two identical models colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create a SmartSim model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) + # Colocate a db with each ensemble entity and add a script + # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) - entity.add_script("test_script1", script_path=torch_script, device="CPU") + entity.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Colocate a db with the non-ensemble Model colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), + port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Add a script to the non-ensemble model torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + colo_ensemble.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) - colo_ensemble.add_script("test_script2", script=torch_script_str, device="CPU") - + # Add the third SmartSim model to the ensemble colo_ensemble.add_model(colo_model) - colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + + # Add another script via file to the entire ensemble + colo_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added one model to the ensemble assert len(colo_ensemble._db_scripts) == 1 # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils): +def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): 
"""Test DB Scripts on colocated DB from ensemble, first adding the script to the ensemble, then colocating the DB""" + # Set Experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create Ensemble with two identical SmartSim Model colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create an additional SmartSim Model entity colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) + # Add a script via string to the ensemble members torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - colo_ensemble.add_script("test_script2", script=torch_script_str, device="CPU") + colo_ensemble.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add a colocated database to the ensemble members + # and then add a script via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) - entity.add_script("test_script1", script_path=torch_script, device="CPU") + entity.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add a colocated database to the non-ensemble SmartSim Model colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), + port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Add the non-ensemble SmartSim Model to the Ensemble + # and then add a script via file colo_ensemble.add_model(colo_model) - colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + colo_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added one model to the ensemble assert len(colo_ensemble._db_scripts) == 1 # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def 
test_db_script_errors(fileutils, wlmutils): +def test_db_script_errors(fileutils, wlmutils, mlutils): """Test DB Scripts error when setting a serialized function on colocated DB""" + # Set Experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim experiment + exp = Experiment(exp_name, launcher=test_launcher) + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Check that an error is raised for adding in-memory + # function when using colocated deployment with pytest.raises(SSUnsupportedError): - colo_model.add_function("test_func", function=timestwo, device="CPU") - - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + colo_model.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + # Create ensemble with two identical SmartSim Model entities + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Add a colocated database for each ensemble member for i, entity in enumerate(colo_ensemble): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Check that an exception is raised when adding an in-memory + # function to the ensemble with colocated databases with pytest.raises(SSUnsupportedError): - colo_ensemble.add_function("test_func", function=timestwo, device="CPU") - - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + colo_ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + # Create an ensemble with two identical SmartSim Model entities + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) - colo_ensemble.add_function("test_func", function=timestwo, device="CPU") + # Add an in-memory function to the ensemble + colo_ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + # Check that an error is raised when trying to add + # a colocated database to ensemble members that have 
+ # an in-memory function for i, entity in enumerate(colo_ensemble): with pytest.raises(SSUnsupportedError): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Check that an error is raised when adding a model + # with a colocated database to an Ensemble that has + # an in-memory function with pytest.raises(SSUnsupportedError): colo_ensemble.add_model(colo_model) From 7b3b4635efb5236eba4c8417a5d8ff371812dd19 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 14:01:56 -0700 Subject: [PATCH 06/25] Add check for file and func parameters. --- smartsim/_core/entrypoints/colocated.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 29b63cce4..a88c037c0 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -141,9 +141,12 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: parser.add_argument("--device", type=str) parser.add_argument("--devices_per_node", type=int) args = parser.parse_args(db_script) + + if args.file and args.func: + raise ValueError("Both file and func cannot be provided.") + if args.func: func = args.func.replace("\\n", "\n") - if args.devices_per_node > 1 and args.device.lower() == "gpu": client.set_script_multigpu(args.name, func, 0, args.devices_per_node) else: client.set_script(args.name, func, args.device) elif args.file: if args.devices_per_node > 1 and args.device.lower() == "gpu": client.set_script_from_file_multigpu(args.name, args.file, 0, args.devices_per_node) else: client.set_script_from_file(args.name, args.file, args.device) + else: + raise ValueError("No file or func provided.") return args.name From ecb03eaf4cb04a963db9758509f9bf3c962622ad Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 14:24:26 -0700 Subject: [PATCH 07/25] Fix comment --- smartsim/entity/model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index d191aa6d3..b173a6ce6 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -462,10 +462,8 @@ def add_function( :param name: key to store function under :type name: str - :param script: TorchScript code - :type script: str or byte string, optional - :param script_path: path to TorchScript code - :type script_path: str, optional + :param function: TorchScript function code + :type function: str, optional :param device: device for script execution, defaults to "CPU" :type device: str, optional :param devices_per_node: The number of GPU devices available on the host. From 87898cb3703d760bfc61b0ef87744f8531e8c3b5 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 16:48:55 -0700 Subject: [PATCH 08/25] Add additional debug info to the dbscript tests.
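The dbscript tests compare tensors with pytest's approx(); when a comparison fails on a remote launcher, the log shows only the bare assertion error. The hunk below prints both sides of every comparison immediately before the assertion so the mismatch is visible in CI output. A condensed, illustrative sketch of the pattern (the Client construction and the input array are assumptions for illustration; the keys and script functions match the test script):

    import numpy as np
    from pytest import approx
    from smartredis import Client

    client = Client(cluster=False)  # assumption: SSDB points at the colocated DB
    array = np.array([1.0, 2.0, 3.0, 4.0])
    client.put_tensor("test_array", array)
    client.run_script("test_script1", "average", ["test_array"], ["test_output"])
    returned = client.get_tensor("test_output")
    # Print both values so a CI failure log shows the actual mismatch
    print(f"Return value from test_script1: {returned}")
    print(f"Expected value from test_script1: {np.mean(array)}")
    assert returned == approx(np.mean(array))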
--- tests/test_configs/run_dbscript_smartredis.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index 367d8d84e..c12515878 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -39,17 +39,23 @@ def main(): assert client.poll_model("test_script1", 500, 30) client.run_script("test_script1", "average", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") + print(f"Return value from test_script_1: {returned}") + print(f"Expected value from test_script1: {np.mean(array)}") assert returned == approx(np.mean(array)) assert client.poll_model("test_script2", 500, 30) client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) + print(f"Return value from test_script_2: {returned}") returned = client.get_tensor("test_output") + print(f"Expected value from test_script2: {-array}") assert returned == approx(-array) if client.model_exists("test_func"): client.run_script("test_func", "timestwo", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") + print(f"Return value from test_func: {returned}") + print(f"Expected value from test_func: {2 * array}") assert returned == approx(2 * array) print(f"Test worked!") From 469819ea4f02413120cc96171638356da181493d Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 17:27:16 -0700 Subject: [PATCH 09/25] Fix placement of debug output. --- tests/test_configs/run_dbscript_smartredis.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index c12515878..699af61e6 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -45,10 +45,9 @@ def main(): assert client.poll_model("test_script2", 500, 30) client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) - print(f"Return value from test_script_2: {returned}") returned = client.get_tensor("test_output") + print(f"Return value from test_script_2: {returned}") print(f"Expected value from test_script2: {-array}") - assert returned == approx(-array) if client.model_exists("test_func"): From ca71797f8246fff91829da52840124f9a4d64fee Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 23:47:41 -0700 Subject: [PATCH 10/25] Replace colocate_db with colocate_db_tcp. 
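These tests exercise a TCP-connected colocated database, so the call sites below now use colocate_db_tcp explicitly rather than the generic colocate_db; the keyword arguments are unchanged. A minimal sketch of the updated call, assuming a local launcher and the suite's default port of 6780:

    import sys
    from smartsim import Experiment

    exp = Experiment("colo-example", launcher="local")
    settings = exp.create_run_settings(
        exe=sys.executable, exe_args="run_dbscript_smartredis.py"
    )
    model = exp.create_model("colocated_model", settings)

    # Same keyword arguments the tests pass; only the method name changes
    model.colocate_db_tcp(
        port=6780,  # assumption: the suite's default SMARTSIM_TEST_PORT
        db_cpus=1,
        limit_app_cpus=False,
        debug=True,
        ifname="lo",
    )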
--- tests/backends/test_dbmodel.py | 18 +++++++++--------- tests/backends/test_dbscript.py | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 54fa65a43..fe8ea6e86 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -384,7 +384,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -453,7 +453,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -518,7 +518,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -532,7 +532,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -638,7 +638,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): # Colocate a database with the first ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port = test_port + i, db_cpus=1, limit_app_cpus=False, @@ -661,7 +661,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): colo_ensemble.add_model(colo_model) # Colocate a database with the new ensemble member - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, @@ -714,7 +714,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -741,7 +741,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Colocate a db with each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -776,7 +776,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index a260fcf2b..7c9cf3dc0 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -234,7 +234,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( 
port=test_port, db_cpus=1, limit_app_cpus=False, @@ -313,7 +313,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -329,7 +329,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): ) # Colocate a db with the non-ensemble Model - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, @@ -418,7 +418,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): # and then add a script via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -434,7 +434,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): ) # Add a colocated database to the non-ensemble SmartSim Model - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, @@ -492,7 +492,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -519,7 +519,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # Add a colocated database for each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -557,7 +557,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # an in-memory script for i, entity in enumerate(colo_ensemble): with pytest.raises(SSUnsupportedError): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, From 2083fdc4b800af4ea57f84aa596181904a610e32 Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Wed, 21 Jun 2023 14:52:04 -0500 Subject: [PATCH 11/25] Remove debug statements. 
--- tests/test_configs/run_dbscript_smartredis.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index 699af61e6..b486b3fcd 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -39,22 +39,16 @@ def main(): assert client.poll_model("test_script1", 500, 30) client.run_script("test_script1", "average", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") - print(f"Return value from test_script_1: {returned}") - print(f"Expected value from test_script1: {np.mean(array)}") assert returned == approx(np.mean(array)) assert client.poll_model("test_script2", 500, 30) client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") - print(f"Return value from test_script_2: {returned}") - print(f"Expected value from test_script2: {-array}") assert returned == approx(-array) if client.model_exists("test_func"): client.run_script("test_func", "timestwo", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") - print(f"Return value from test_func: {returned}") - print(f"Expected value from test_func: {2 * array}") assert returned == approx(2 * array) print(f"Test worked!") From 40215569839a31c18d343aa1e0c435628df0e659 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:43:33 -0700 Subject: [PATCH 12/25] Fix return type of test_num_gpus --- smartsim/_core/config/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 42efb7441..06701e8f4 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -148,7 +148,7 @@ def log_level(self) -> str: @property def jm_interval(self) -> int: - return int(os.environ.get("SMARTSIM_JM_INTERVAL", 10)) + return int(os.environ.get("SMARTSIM_JM_INTERVAL") or 10) @property def wlm_trials(self) -> int: @@ -163,8 +163,8 @@ def test_device(self) -> str: # pragma: no cover return os.environ.get("SMARTSIM_TEST_DEVICE", "CPU") @property - def test_num_gpus(self) -> str: # pragma: no cover - return os.environ.get("SMARTSIM_TEST_NUM_GPUS", 1) + def test_num_gpus(self) -> int: # pragma: no cover + return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS") or 1) @property def test_port(self) -> int: # pragma: no cover From fb61db3a23f8938bd3f2294f52bfe59e1acc9df0 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:47:12 -0700 Subject: [PATCH 13/25] Fix default handling of wlm_trials.
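This applies the same environment-variable handling used for jm_interval and test_num_gpus in the previous patch. A plausible motivation, sketched below: a dict-style default in os.environ.get() only applies when the variable is unset, so a variable exported as an empty string (an assumed scenario, e.g. from a CI template) reaches int() and raises; the `or` form treats unset and empty the same way.

    import os

    os.environ["SMARTSIM_WLM_TRIALS"] = ""  # set, but empty

    # Old form: the default applies only when the variable is unset,
    # so the empty string reaches int() and raises ValueError
    try:
        int(os.environ.get("SMARTSIM_WLM_TRIALS", 10))
    except ValueError:
        print("int('') raises ValueError")

    # New form: `or` also replaces the falsy empty string
    print(int(os.environ.get("SMARTSIM_WLM_TRIALS") or 10))  # 10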
--- smartsim/_core/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 06701e8f4..99db81196 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -152,7 +152,7 @@ def jm_interval(self) -> int: @property def wlm_trials(self) -> int: - return int(os.environ.get("SMARTSIM_WLM_TRIALS", 10)) + return int(os.environ.get("SMARTSIM_WLM_TRIALS") or 10) @property def test_launcher(self) -> str: # pragma: no cover From bb298a4e738965c4de6f18bd052ae98b162ce62b Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:48:17 -0700 Subject: [PATCH 14/25] Fix global nature of test_num_gpus --- conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/conftest.py b/conftest.py index 25a8a76a4..9b401fd5a 100644 --- a/conftest.py +++ b/conftest.py @@ -595,5 +595,4 @@ def get_test_device(): @staticmethod def get_test_num_gpus(): - global test_num_gpus return test_num_gpus From c439d4dae93652af08784da3201633e211372c6d Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:58:54 -0700 Subject: [PATCH 15/25] Fix type hint on colocate_db_tcp --- smartsim/entity/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index b173a6ce6..fe2ead1e1 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -221,7 +221,7 @@ def colocate_db_uds( def colocate_db_tcp( self, port: int = 6379, - ifname: str = "lo", + ifname: t.Union[str, list[str]] = "lo", db_cpus: int = 1, limit_app_cpus: bool = True, debug: bool = False, @@ -252,7 +252,7 @@ def colocate_db_tcp( :param port: port to use for orchestrator database, defaults to 6379 :type port: int, optional :param ifname: interface to use for orchestrator, defaults to "lo" - :type ifname: str, optional + :type ifname: str | list[str], optional :param db_cpus: number of cpus to use for orchestrator, defaults to 1 :type db_cpus: int, optional :param limit_app_cpus: whether to limit the number of cpus used by the app, defaults to True From 467bd92f4c6dc928109e400e2ae46a751959d31f Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 15:16:54 -0700 Subject: [PATCH 16/25] Make consistent the network interface for colocated db. 
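For context on the hunks below: test_colocated_db_model_tf hard-coded the interface, so the SMARTSIM_TEST_INTERFACE setting was ignored in that test, while the other tests carried a leftover hard-coded assignment that the configured value immediately overwrote as dead code. A condensed before/after of the pattern being removed (names follow the tests):

    # Before: the first assignment is either the only one, or dead code
    test_interface = "lo"
    test_interface = wlmutils.get_test_interface()

    # After: a single source of truth
    test_interface = wlmutils.get_test_interface()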
--- tests/backends/test_dbmodel.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index fe8ea6e86..3705b806a 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -368,7 +368,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" + test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() @@ -436,7 +436,6 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() @@ -495,7 +494,6 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() @@ -597,7 +595,6 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() @@ -697,7 +694,6 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() From 133aaad88644d0af8d15d39fbba398d4b87189f2 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 15:37:22 -0700 Subject: [PATCH 17/25] Update docs --- doc/testing.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/testing.rst b/doc/testing.rst index 249752b93..0c27c266b 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -105,6 +105,8 @@ are a few other runtime test configuration options for SmartSim - ``SMARTSIM_TEST_LAUNCHER``: Workload manager of the system (local by default) - ``SMARTSIM_TEST_ACCOUNT``: Project account for allocations (used for customer systems mostly) - ``SMARTSIM_TEST_DEVICE``: ``cpu`` or ``gpu`` + - ``SMARTSIM_TEST_NUM_GPUS``: the number of GPUs to use for model and script testing (defaults to 1) + - ``SMARTSIM_TEST_PORT``: the port to use for database communication - ``SMARTSIM_TEST_INTERFACE``: network interface to use. For the ``SMARTSIM_TEST_INTERFACE``, the default is ``ipogif0`` which From 73749c4fc72771d46d97828d50bd9918b9a3df56 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 15:45:35 -0700 Subject: [PATCH 18/25] Update docs. --- doc/testing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/testing.rst b/doc/testing.rst index 0c27c266b..d7091008d 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -25,7 +25,7 @@ level of the SmartSim directory:: .. note:: -You must have the extra dev dependencies installed in +You must have the extra dev dependencies installed in your python environment to execute tests. 
Install ``dev`` dependencies with ``pip`` @@ -106,7 +106,7 @@ are a few other runtime test configuration options for SmartSim - ``SMARTSIM_TEST_ACCOUNT``: Project account for allocations (used for customer systems mostly) - ``SMARTSIM_TEST_DEVICE``: ``cpu`` or ``gpu`` - ``SMARTSIM_TEST_NUM_GPUS``: the number of GPUs to use for model and script testing (defaults to 1) - - ``SMARTSIM_TEST_PORT``: the port to use for database communication + - ``SMARTSIM_TEST_PORT``: the port to use for database communication (defaults to 6780) - ``SMARTSIM_TEST_INTERFACE``: network interface to use. For the ``SMARTSIM_TEST_INTERFACE``, the default is ``ipogif0`` which From ba3e53931f698335243a28dbacc19565ed92b5be Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 17:28:20 -0700 Subject: [PATCH 19/25] Test setting lo interface for CI/CD --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 91e717c09..c9c9e5288 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -121,6 +121,7 @@ jobs: - name: Run Pytest run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV + echo "SMARTSIM_TEST_INTERFACE=lo" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/ - name: Upload Pytest coverage to Codecov From 4f3a329430300142ff66f146ae27cd7070080c2b Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 20:20:50 -0700 Subject: [PATCH 20/25] Check CI/CD env var --- conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conftest.py b/conftest.py index 9b401fd5a..c965dcef0 100644 --- a/conftest.py +++ b/conftest.py @@ -74,6 +74,10 @@ def print_test_configuration(): global test_nic global test_alloc_specs_path global test_port + + env_var = os.environ.get("SMARTSIM_TEST_INTERFACE") + print(f"THE ENVIRONMENT VALUE OF SMARTSIM_TEST_INTERFACE is {env_var}") + print("TEST_SMARTSIM_LOCATION:", smartsim.__path__) print("TEST_PATH:", test_path) print("TEST_LAUNCHER:", test_launcher) From 0374a70f4858e1d472c39e5cea4bcf11816e8bfe Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 21:43:55 -0700 Subject: [PATCH 21/25] Fix env var test for run_tests.yml --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index c9c9e5288..8d622b768 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -64,6 +64,7 @@ jobs: env: SMARTSIM_REDISAI: ${{ matrix.rai }} + SMARTSIM_TEST_INTERFACE: lo steps: - uses: actions/checkout@v2 @@ -121,7 +122,6 @@ jobs: - name: Run Pytest run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV - echo "SMARTSIM_TEST_INTERFACE=lo" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/ - name: Upload Pytest coverage to Codecov From 5f29fc84a74fcf9e1f9cd2b6ad84bdaee606be48 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 22:40:31 -0700 Subject: [PATCH 22/25] Fix error in the test_dbmodel.py port numbers. 
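In test_colocated_db_model_ensemble, the standalone colocated model already uses test_port, so ensemble member 0 (test_port + 0) requested the same port for its database. Offsetting the members by i + 1 keeps every colocated database on a distinct port. A quick illustration, assuming the suite's default port of 6780:

    test_port = 6780
    colo_model_port = test_port                            # standalone model: 6780
    member_ports = [test_port + i + 1 for i in range(2)]   # members: 6781, 6782
    assert colo_model_port not in member_ports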
--- tests/backends/test_dbmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 3705b806a..950782b56 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -531,7 +531,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db_tcp( - port=test_port + i, + port=test_port + i + 1, db_cpus=1, limit_app_cpus=False, debug=True, From 4d03b6cbe52468332ff6760bf6cdfcfd62f13df3 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 23:14:30 -0700 Subject: [PATCH 23/25] Remove conftest.py debug statement. --- conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/conftest.py b/conftest.py index c965dcef0..1fdf0fa88 100644 --- a/conftest.py +++ b/conftest.py @@ -75,9 +75,6 @@ def print_test_configuration(): global test_alloc_specs_path global test_port - env_var = os.environ.get("SMARTSIM_TEST_INTERFACE") - print(f"THE ENVIRONMENT VALUE OF SMARTSIM_TEST_INTERFACE is {env_var}") - print("TEST_SMARTSIM_LOCATION:", smartsim.__path__) print("TEST_PATH:", test_path) print("TEST_LAUNCHER:", test_launcher) From 66ebb312a87c90c991384b5828b8a5d8ae23181b Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Wed, 28 Jun 2023 12:48:37 -0500 Subject: [PATCH 24/25] Change default testing interface to lo instead of ipogif0. --- .github/workflows/run_tests.yml | 1 - smartsim/_core/config/config.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 8d622b768..91e717c09 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -64,7 +64,6 @@ jobs: env: SMARTSIM_REDISAI: ${{ matrix.rai }} - SMARTSIM_TEST_INTERFACE: lo steps: - uses: actions/checkout@v2 diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 99db81196..4c69f2943 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -188,7 +188,7 @@ def test_interface(self) -> t.List[str]: # pragma: no cover elif "ib0" in net_if_addrs: return ["ib0"] # default to aries network - return ["ipogif0"] + return ["lo"] @property def test_account(self) -> t.Optional[str]: # pragma: no cover From af388068f65bda9ff50ae9ece6a4ac1cf320b5ee Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Wed, 28 Jun 2023 12:52:46 -0500 Subject: [PATCH 25/25] Update changelog. --- doc/changelog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index d432722c6..9110aae58 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -43,7 +43,8 @@ Detailed notes unexpectedly due to copy-by-ref (PR305_) - The underlying code for Model.add_ml_model() and Model.add_script() was fixed to correctly handle multi-GPU configurations. Tests were updated to run on - non-local launchers. Documentation was updated and fixed. (PR304_) + non-local launchers. Documentation was updated and fixed. Also, the default + testing interface has been changed to lo instead of ipogif0. (PR304_) - Typehints have been added. A makefile target `make check-mypy` executes static analysis with mypy. (PR295_, PR303_) - Simplify code in `random_permutations` parameter generation strategy (PR300_)