From 02b9069ea436b89af101af1c81c8eb56eb99b4dd Mon Sep 17 00:00:00 2001
From: Matthew Ellis
Date: Wed, 14 Jun 2023 09:30:25 -0700
Subject: [PATCH 01/25] Fix missing Model functions in the documentation summary.

---
 doc/api/smartsim_api.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst
index 4b953f89a..f6e876867 100644
--- a/doc/api/smartsim_api.rst
+++ b/doc/api/smartsim_api.rst
@@ -431,6 +431,10 @@ Model
    Model.colocate_db
    Model.colocate_db_tcp
    Model.colocate_db_uds
+   Model.colocated
+   Model.add_ml_model
+   Model.add_script
+   Model.add_function
    Model.params_to_args
    Model.register_incoming_entity
    Model.enable_key_prefixing

From 9784109e17bd278071197f426ad8a214c1ff4360 Mon Sep 17 00:00:00 2001
From: Matthew Ellis
Date: Wed, 14 Jun 2023 09:34:17 -0700
Subject: [PATCH 02/25] Update changelog.

---
 doc/changelog.rst | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 77054b42c..d432722c6 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -25,6 +25,7 @@ Description

 A full list of changes and detailed notes can be found below:

+- Fix add_ml_model() and add_script() documentation, tests, and code
 - Remove wait time associated with Experiment launch summary
 - Update and rename Redis conf file
 - Migrate from redis-py-cluster to redis-py
@@ -38,8 +39,11 @@ A full list of changes and detailed notes can be found below:

 Detailed notes

-- Fix defect where dictionaries used to create run settings can be changed
+- Fix defect where dictionaries used to create run settings can be changed
   unexpectedly due to copy-by-ref (PR305_)
+- The underlying code for Model.add_ml_model() and Model.add_script() was fixed
+  to correctly handle multi-GPU configurations. Tests were updated to run on
+  non-local launchers. Documentation was updated and fixed. (PR304_)
 - Typehints have been added. A makefile target `make check-mypy` executes static
   analysis with mypy. (PR295_, PR303_)
 - Simplify code in `random_permutations` parameter generation strategy (PR300_)
 - Remove wait time associated with Experiment launch summary (PR298_)
 - Update Redis conf file to conform with Redis standard (PR293_)
 - Migrate from redis-py-cluster to redis-py for cluster status checks (PR292_)
 - Update full test suite to no longer require a tensorflow wheel to be
   available at test time. (PR291_)
 - Correct spelling of colocated in doc strings (PR290_)
-- Deprecated launcher-specific orchestrators, constants, and ML
+- Deprecated launcher-specific orchestrators, constants, and ML
   utilities were removed. (PR289_)
 - Relax the coloredlogs version to be greater than 10.0 (PR288_)
 - Update the Github Actions runner image from ``macos-10.15`` to ``macos-12``.
   The former began deprecation in May 2022 and was finally removed in May 2023.
   (PR285_)
-- The Fortran tutorials had not been fully updated to show how to handle
+- The Fortran tutorials had not been fully updated to show how to handle
   return/error codes. These have now all been updated. (PR284_)
-- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The
+- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The
   argument name is still `interface` for backward compatibility reasons. (PR281_)

.. _PR305: https://github.com/CrayLabs/SmartSim/pull/305
+.. _PR304: https://github.com/CrayLabs/SmartSim/pull/304
.. _PR303: https://github.com/CrayLabs/SmartSim/pull/303
.. _PR300: https://github.com/CrayLabs/SmartSim/pull/300
..
_PR298: https://github.com/CrayLabs/SmartSim/pull/298 From 39241e294d7785e27af17ad2ef68b100d42f37dc Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Wed, 14 Jun 2023 15:10:31 -0700 Subject: [PATCH 03/25] Fix the functionality of setting models and scripts on multiple GPUs. --- smartsim/_core/entrypoints/colocated.py | 48 ++++++++++++------------- smartsim/entity/model.py | 18 +++++++--- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index e03449a3f..29b63cce4 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -92,7 +92,22 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: if args.outputs: outputs = list(args.outputs) - if args.devices_per_node == 1: + # devices_per_node being greater than one only applies + # to GPU devices + if args.devices_per_node > 1 and args.device.lower() == "gpu": + client.set_model_from_file_multigpu( + args.name, + args.file, + args.backend, + 0, + args.devices_per_node, + args.batch_size, + args.min_batch_size, + args.tag, + inputs, + outputs + ) + else: client.set_model_from_file( args.name, args.file, @@ -102,21 +117,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: args.min_batch_size, args.tag, inputs, - outputs, + outputs ) - else: - for device_num in range(args.devices_per_node): - client.set_model_from_file( - args.name, - args.file, - args.backend, - args.device + f":{device_num}", - args.batch_size, - args.min_batch_size, - args.tag, - inputs, - outputs, - ) return args.name @@ -142,19 +144,15 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: if args.func: func = args.func.replace("\\n", "\n") - if args.devices_per_node == 1: - client.set_script(args.name, func, args.device) + if args.devices_per_node > 1 and args.device.lower() == "gpu": + client.set_script_multigpu(args.name, func, 0, args.devices_per_node) else: - for device_num in range(args.devices_per_node): - client.set_script(args.name, func, args.device + f":{device_num}") + client.set_script(args.name, func, args.device) elif args.file: - if args.devices_per_node == 1: - client.set_script_from_file(args.name, args.file, args.device) + if args.devices_per_node > 1 and args.device.lower() == "gpu": + client.set_script_from_file_multigpu(args.name, args.file, 0, args.devices_per_node) else: - for device_num in range(args.devices_per_node): - client.set_script_from_file( - args.name, args.file, args.device + f":{device_num}" - ) + client.set_script_from_file(args.name, args.file, args.device) return args.name diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 3729aa720..c17f6834d 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -75,7 +75,7 @@ def __init__( self._db_models: t.List[DBModel] = [] self._db_scripts: t.List[DBScript] = [] self.files: t.Optional[EntityFiles] = None - + @property def colocated(self) -> bool: """Return True if this Model will run with a colocated Orchestrator""" @@ -356,14 +356,18 @@ def add_ml_model( :param name: key to store model under :type name: str + :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) + :type backend: str :param model: model in memory :type model: byte string, optional :param model_path: serialized model :type model_path: file path to model - :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str :param device: name of device for execution, defaults to "CPU" 
:type device: str, optional
+    :param devices_per_node: The number of GPU devices available on the host.
+        This parameter only applies to GPU devices and will be ignored if device
+        is specified as CPU.
+    :type devices_per_node: int
     :param batch_size: batch size for execution, defaults to 0
     :type batch_size: int, optional
     :param min_batch_size: minimum batch size for model execution, defaults to 0
@@ -421,7 +425,9 @@ def add_script(
     :type script_path: str, optional
     :param device: device for script execution, defaults to "CPU"
     :type device: str, optional
-    :param devices_per_node: number of devices on each host
+    :param devices_per_node: The number of GPU devices available on the host.
+        This parameter only applies to GPU devices and will be ignored if device
+        is specified as CPU.
     :type devices_per_node: int
     """
     db_script = DBScript(
@@ -462,7 +468,9 @@ def add_function(
     :type script_path: str, optional
     :param device: device for script execution, defaults to "CPU"
     :type device: str, optional
-    :param devices_per_node: number of devices on each host
+    :param devices_per_node: The number of GPU devices available on the host.
+        This parameter only applies to GPU devices and will be ignored if device
+        is specified as CPU.
     :type devices_per_node: int
     """
     db_script = DBScript(

From 1480fe26749374498aa31b5ce9ce9adfcc7bdb7a Mon Sep 17 00:00:00 2001
From: Matthew Ellis
Date: Thu, 15 Jun 2023 09:29:50 -0700
Subject: [PATCH 04/25] Update multi-db code and improve colocated db tests.

---
 conftest.py                     |  6 +++
 smartsim/_core/config/config.py |  8 +++
 tests/backends/test_dbmodel.py  | 96 ++++++++++++++++++++++++---------
 tests/backends/test_tf.py       |  2 +-
 4 files changed, 87 insertions(+), 25 deletions(-)

diff --git a/conftest.py b/conftest.py
index 823e7fc53..25a8a76a4 100644
--- a/conftest.py
+++ b/conftest.py
@@ -51,6 +51,7 @@
 test_dir = os.path.join(test_path, "tests", "test_output")
 test_launcher = CONFIG.test_launcher
 test_device = CONFIG.test_device
+test_num_gpus = CONFIG.test_num_gpus
 test_nic = CONFIG.test_interface
 test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None)
 test_port = CONFIG.test_port
@@ -591,3 +592,8 @@ class MLUtils:
     def get_test_device():
         global test_device
         return test_device
+
+    @staticmethod
+    def get_test_num_gpus():
+        global test_num_gpus
+        return test_num_gpus

diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py
index 832958f13..42efb7441 100644
--- a/smartsim/_core/config/config.py
+++ b/smartsim/_core/config/config.py
@@ -79,6 +79,10 @@
 # - CPU or GPU for model serving tests
 # - Default: CPU
 #
+# SMARTSIM_TEST_NUM_GPUS
+# - Number of GPUs on the host for testing
+# - Default: 1
+#
 # SMARTSIM_TEST_ACCOUNT
 # - Account used to run full launcher test suite on external systems
 # - Default: None
@@ -158,6 +162,10 @@ def test_launcher(self) -> str:  # pragma: no cover
     def test_device(self) -> str:  # pragma: no cover
         return os.environ.get("SMARTSIM_TEST_DEVICE", "CPU")

+    @property
+    def test_num_gpus(self) -> int:  # pragma: no cover
+        return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS", 1))
+
     @property
     def test_port(self) -> int:  # pragma: no cover
         return int(os.environ.get("SMARTSIM_TEST_PORT", 6780))

diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py
index b0e393db8..15e4ded98 100644
--- a/tests/backends/test_dbmodel.py
+++ b/tests/backends/test_dbmodel.py
@@ -131,7 +131,7 @@ def save_torch_cnn(path, file_name):

 @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
-def test_tf_db_model(fileutils,
wlmutils): +def test_tf_db_model(fileutils, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" exp_name = "test-tf-db-model" @@ -153,11 +153,15 @@ def test_tf_db_model(fileutils, wlmutils): model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + smartsim_model.add_ml_model( "cnn", "TF", model=model, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs, tag="test", @@ -166,7 +170,8 @@ def test_tf_db_model(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, tag="test", @@ -185,7 +190,7 @@ def test_tf_db_model(fileutils, wlmutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, wlmutils): +def test_pt_db_model(fileutils, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" exp_name = "test-pt-db-model" @@ -207,11 +212,15 @@ def test_pt_db_model(fileutils, wlmutils): save_torch_cnn(test_dir, "model1.pt") model_path = test_dir + "/model1.pt" + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + smartsim_model.add_ml_model( "cnn", "TORCH", model_path=model_path, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, tag="test", ) @@ -228,7 +237,7 @@ def test_pt_db_model(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble(fileutils, wlmutils): +def test_db_model_ensemble(fileutils, wlmutils, mlutils): """Test DBModels on remote DB, with an ensemble""" exp_name = "test-db-model-ensemble" @@ -259,13 +268,17 @@ def test_db_model_ensemble(fileutils, wlmutils): "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs ) + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + for entity in smartsim_ensemble: entity.disable_key_prefixing() entity.add_ml_model( "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -276,7 +289,8 @@ def test_db_model_ensemble(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -293,7 +307,7 @@ def test_db_model_ensemble(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_tf(fileutils, wlmutils): +def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (TensorFlow backend)""" exp_name = "test-colocated-db-model-tf" @@ -319,14 +333,24 @@ def test_colocated_db_model_tf(fileutils, wlmutils): model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + colo_model.add_ml_model( - "cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs + "cnn", + "TF", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus, + inputs=inputs, + outputs=outputs ) colo_model.add_ml_model( "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, 
outputs=outputs2, ) @@ -352,7 +376,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_colocated_db_model_pytorch(fileutils, wlmutils): +def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (PyTorch backend)""" exp_name = "test-colocated-db-model-pytorch" @@ -375,9 +399,16 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils): ifname="lo", ) + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + save_torch_cnn(test_dir, "model1.pt") model_file = test_dir + "/model1.pt" - colo_model.add_ml_model("cnn", "TORCH", model_path=model_file, device="CPU") + colo_model.add_ml_model("cnn", + "TORCH", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus) # Assert we have added both models assert len(colo_model._db_models) == 1 @@ -388,7 +419,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble(fileutils, wlmutils): +def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): """Test DBModel on colocated ensembles, first colocating DB, then adding DBModel. """ @@ -421,6 +452,9 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + for i, entity in enumerate(colo_ensemble): entity.colocate_db( port=wlmutils.get_test_port() + i, @@ -434,7 +468,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -444,7 +479,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): "cnn", "TF", model_path=model_file, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs, tag="test", @@ -463,7 +499,8 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -474,7 +511,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): +def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): """Test DBModel on colocated ensembles, first adding the DBModel to the ensemble, then colocating DB. 
""" @@ -500,6 +537,9 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + # Test adding a model from ensemble colo_ensemble.add_ml_model( "cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs @@ -518,7 +558,8 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -536,7 +577,8 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): "cnn2", "TF", model_path=model_file2, - device="CPU", + device=test_device, + devices_per_node=test_num_gpus, inputs=inputs2, outputs=outputs2, ) @@ -547,7 +589,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, wlmutils): +def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): """Test error when colocated db model has no file.""" exp_name = "test-colocated-db-model-error" @@ -572,9 +614,13 @@ def test_colocated_db_model_errors(fileutils, wlmutils): model, inputs, outputs = create_tf_cnn() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() + with pytest.raises(SSUnsupportedError): colo_model.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", "TF", model=model, device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) colo_ensemble = exp.create_ensemble( @@ -592,7 +638,8 @@ def test_colocated_db_model_errors(fileutils, wlmutils): with pytest.raises(SSUnsupportedError): colo_ensemble.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", "TF", model=model, device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) # Check errors for reverse order of DBModel addition and DB colocation @@ -607,7 +654,8 @@ def test_colocated_db_model_errors(fileutils, wlmutils): ) colo_ensemble2.set_path(test_dir) colo_ensemble2.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", "TF", model=model, device=test_device, + devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index 2bf327b16..e30ad4f24 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -54,7 +54,7 @@ def test_keras_model(fileutils, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU - Similarly, the test can excute on any launcher by setting SMARTSIM_TEST_LAUNCHER + Similarly, the test can execute on any launcher by setting SMARTSIM_TEST_LAUNCHER which is local by default. You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU From 186cb10ce4b98a8027bcf9fe57611cbe02affeed Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Thu, 15 Jun 2023 23:18:55 -0700 Subject: [PATCH 05/25] Fix DBModel and DBScript tests to be able to run on WLM tests with GPUs. 
--- smartsim/entity/model.py | 4 +- tests/backends/test_dbmodel.py | 397 ++++++++++++++++++++----------- tests/backends/test_dbscript.py | 403 ++++++++++++++++++++++++-------- 3 files changed, 563 insertions(+), 241 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index c17f6834d..d191aa6d3 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -358,7 +358,7 @@ def add_ml_model( :type name: str :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) :type backend: str - :param model: model in memory + :param model: A model in memory (only supported for non-colocated orchestrators) :type model: byte string, optional :param model_path: serialized model :type model_path: file path to model @@ -419,7 +419,7 @@ def add_script( :param name: key to store script under :type name: str - :param script: TorchScript code + :param script: TorchScript code (only supported for non-colocated orchestrators) :type script: str, optional :param script_path: path to TorchScript code :type script_path: str, optional diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 15e4ded98..54fa65a43 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -34,6 +34,9 @@ from smartsim import Experiment, status from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError +from smartsim.log import get_logger + +logger = get_logger(__name__) should_run_tf = True should_run_pt = True @@ -134,28 +137,37 @@ def save_torch_cnn(path, file_name): def test_tf_db_model(fileutils, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" + # Set experiment name exp_name = "test-tf-db-model" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add ML model to the SmartSim model smartsim_model.add_ml_model( "cnn", "TF", @@ -177,44 +189,57 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): tag="test", ) + logger.debug("The following ML models have been added:") for db_model in smartsim_model._db_models: - print(db_model) + logger.debug(db_model) # Assert we have added both models assert len(smartsim_model._db_models) == 2 - 
exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") def test_pt_db_model(fileutils, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" + # Set experiment name exp_name = "test-pt-db-model" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") model_path = test_dir + "/model1.pt" - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add ML model to the SmartSim model smartsim_model.add_ml_model( "cnn", "TORCH", @@ -224,53 +249,75 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): tag="test", ) + logger.debug("The following ML models have been added:") for db_model in smartsim_model._db_models: - print(db_model) + logger.debug(db_model) + # Assert we have added both models assert len(smartsim_model._db_models) == 1 - exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_db_model_ensemble(fileutils, wlmutils, mlutils): """Test DBModels on remote DB, with an ensemble""" + # Set experiment name exp_name = "test-db-model-ensemble" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + # Create the 
SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create ensemble smartsim_ensemble = exp.create_ensemble( "smartsim_model", run_settings=run_settings, replicas=2 ) smartsim_ensemble.set_path(test_dir) + # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") + # Add the first ML model to all of the ensemble members smartsim_ensemble.add_ml_model( - "cnn", "TF", model=model, device="CPU", inputs=inputs, outputs=outputs + "cnn", + "TF", + model=model, + device=test_device, + devices_per_node=test_num_gpus, + inputs=inputs, + outputs=outputs ) - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add the second ML model individually to each SmartSim model for entity in smartsim_ensemble: entity.disable_key_prefixing() entity.add_ml_model( @@ -283,8 +330,11 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): outputs=outputs2, ) - # Ensemble must add all available DBModels to new entity + # Add new ensemble member smartsim_ensemble.add_model(smartsim_model) + + # Add the second ML model to the newly added entity. This is + # because the test script run both ML models for all entities. 
smartsim_model.add_ml_model( "cnn2", "TF", @@ -300,42 +350,53 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): # Assert we have added two models to each entity assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) - exp.start(db, smartsim_ensemble, block=True) - statuses = exp.get_status(smartsim_ensemble) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_ensemble, block=True) + statuses = exp.get_status(smartsim_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (TensorFlow backend)""" + # Set experiment name exp_name = "test-colocated-db-model-tf" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experience + exp = Experiment(exp_name, launcher=test_launcher) + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Create and save ML model to filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Add ML models to the application colo_model.add_ml_model( "cnn", "TF", @@ -358,52 +419,53 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_models) == 2 - exp.start(colo_model, block=False) - - completed = False - timeout = 90 - check_interval = 5 - while timeout and not completed: - timeout -= check_interval - time.sleep(check_interval) + # Launch and check successful completion + try: + exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - if all([stat == status.STATUS_COMPLETED for stat in statuses]): - completed = True - - if not completed: + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: exp.stop(colo_model) - assert False - @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): """Test DB Models on colocated DB (PyTorch backend)""" + # Set experiment name exp_name = "test-colocated-db-model-pytorch" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + 
test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + + # Create the SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create colocated RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") model_file = test_dir + "/model1.pt" + + # Add the ML model to the SmartSim Model colo_model.add_ml_model("cnn", "TORCH", model_path=model_file, @@ -413,9 +475,13 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_models) == 1 - exp.start(colo_model, block=True) - statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_model) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @@ -424,46 +490,57 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): then adding DBModel. 
""" + # Set experiment name exp_name = "test-colocated-db-model-ensemble" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # Create RunSettings for colocated model + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create ensemble of two identical models colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Create and save the ML models to the filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) - # Test that models added individually do not conflict with enemble ones + # Add ML model to each ensemble member individual to test that they + # do not conflict with models add to the Ensemble object entity.add_ml_model( "cnn2", "TF", @@ -474,7 +551,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): outputs=outputs2, ) - # Test adding a model from ensemble + # Test adding a model from Ensemble object colo_ensemble.add_ml_model( "cnn", "TF", @@ -486,15 +563,10 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): tag="test", ) - # Ensemble should add all available DBModels to new model + # Add a new model to the ensemble colo_ensemble.add_model(colo_model) - colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), - db_cpus=1, - limit_app_cpus=False, - debug=True, - ifname="lo", - ) + + # Add the ML model to SmartSim Model just added to the ensemble colo_model.add_ml_model( "cnn2", "TF", @@ -505,9 +577,13 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): outputs=outputs2, ) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to 
run") @@ -516,44 +592,61 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): ensemble, then colocating DB. """ + # Set experiment name exp_name = "test-colocated-db-model-ensemble-reordered" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + # Create the SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create colocated RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create the ensemble of two identical SmartSim Model colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) + # Create and save ML model to filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - # Test adding a model from ensemble colo_ensemble.add_ml_model( - "cnn", "TF", model_path=model_file, device="CPU", inputs=inputs, outputs=outputs + "cnn", + "TF", + model_path=model_file, + device=test_device, + devices_per_node=test_num_gpus, + inputs=inputs, + outputs=outputs ) + # Colocate a database with the first ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db( - wlmutils.get_test_port() + i, + port = test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) - # Test that models added individually do not conflict with enemble ones + # Add ML models to each ensemble member to make sure they + # do not conflict with other ML models entity.add_ml_model( "cnn2", "TF", @@ -564,15 +657,18 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): outputs=outputs2, ) - # Ensemble should add all available DBModels to new model + # Add another ensemble member colo_ensemble.add_model(colo_model) + + # Colocate a database with the new ensemble member colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), + port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Add a ML model to the new ensemble member colo_model.add_ml_model( "cnn2", "TF", @@ -583,69 +679,90 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): outputs=outputs2, ) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) 
@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): """Test error when colocated db model has no file.""" + # Set experiment name exp_name = "test-colocated-db-model-error" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = "lo" + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + # Create colocated RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Get and save TF model model, inputs, outputs = create_tf_cnn() - test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() - + # Check that an error is raised because in-memory models + # are only supported for non-colocated deployments with pytest.raises(SSUnsupportedError): colo_model.add_ml_model( "cnn", "TF", model=model, device=test_device, devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) + # Create an ensemble with two identical replicas colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + + # Colocate a db with each ensemble member for i, entity in enumerate(colo_ensemble): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Check that an error is raised because in-memory models + # are only supported for non-colocated deployments with pytest.raises(SSUnsupportedError): colo_ensemble.add_ml_model( "cnn", "TF", model=model, device=test_device, devices_per_node=test_num_gpus, inputs=inputs, outputs=outputs ) - # Check errors for reverse order of DBModel addition and DB colocation - # create colocated model + # Check error is still thrown if an in-memory model is used + # with a colocated deployment. This test varies by adding + # the SmartSIm model with a colocated database to the ensemble + # after the ML model was been added to the ensemble. 
colo_settings2 = exp.create_run_settings( - exe=sys.executable, exe_args=sr_test_script + exe=sys.executable, exe_args=test_script ) # Reverse order of DBModel and model @@ -660,11 +777,11 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) with pytest.raises(SSUnsupportedError): diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index b56f85a8f..a260fcf2b 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -31,6 +31,9 @@ from smartsim import Experiment, status from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError +from smartsim.log import get_logger + +logger = get_logger(__name__) should_run = True @@ -47,321 +50,523 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, wlmutils): +def test_db_script(fileutils, wlmutils, mlutils): """Test DB scripts on remote DB""" + # Set experiment name exp_name = "test-db-script" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Create the RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create the SmartSim database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Define the torch script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - smartsim_model.add_script("test_script1", script_path=torch_script, device="CPU") - smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") - smartsim_model.add_function("test_func", function=timestwo, device="CPU") + # Add the script via file + smartsim_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + + # Add script via string + smartsim_model.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) - # Assert we have all three models + # Add script function + smartsim_model.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + + # Assert we have all three scripts assert len(smartsim_model._db_scripts) == 3 - exp.start(db, smartsim_model, block=True) - statuses = 
exp.get_status(smartsim_model) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, wlmutils): +def test_db_script_ensemble(fileutils, wlmutils, mlutils): """Test DB scripts on remote DB""" + # Set experiment name exp_name = "test-db-script" - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - exp = Experiment(exp_name, exp_path=test_dir, launcher="local") - # create colocated model - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # Create RunSettings + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create Ensemble with two identical models ensemble = exp.create_ensemble( "dbscript_ensemble", run_settings=run_settings, replicas=2 ) ensemble.set_path(test_dir) + # Create SmartSim model smartsim_model = exp.create_model("smartsim_model", run_settings) smartsim_model.set_path(test_dir) - db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + # Create SmartSim database + db = exp.create_database(port=test_port, interface=test_interface) exp.generate(db) + # Create the script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - ensemble.add_script("test_script1", script_path=torch_script, device="CPU") + # Add script via file for the Ensemble object + ensemble.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add script via string for each ensemble entity for entity in ensemble: entity.disable_key_prefixing() - entity.add_script("test_script2", script=torch_script_str, device="CPU") + entity.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) - ensemble.add_function("test_func", function=timestwo, device="CPU") + # Add script via function + ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) - # Ensemble must add all available DBScripts to new entity + # Add an additional ensemble member and attach a script to the new member ensemble.add_model(smartsim_model) - smartsim_model.add_script("test_script2", script=torch_script_str, device="CPU") + smartsim_model.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added both models to the ensemble assert len(ensemble._db_scripts) == 2 + # Assert we have added all three models to entities in ensemble assert all([len(entity._db_scripts) == 3 for entity in ensemble]) - exp.start(db, ensemble, block=True) - 
statuses = exp.get_status(ensemble) - exp.stop(db) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + try: + exp.start(db, ensemble, block=True) + statuses = exp.get_status(ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(db) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, wlmutils): +def test_colocated_db_script(fileutils, wlmutils, mlutils): """Test DB Scripts on colocated DB""" + # Set the experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create the SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Create string for script creation torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - colo_model.add_script("test_script1", script_path=torch_script, device="CPU") - colo_model.add_script("test_script2", script=torch_script_str, device="CPU") + # Add script via file + colo_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add script via string + colo_model.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added both models assert len(colo_model._db_scripts) == 2 for db_script in colo_model._db_scripts: - print(db_script) + logger.debug(db_script) - exp.start(colo_model, block=True) - statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + try: + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_model) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble(fileutils, wlmutils): +def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): """Test DB Scripts on colocated DB from ensemble, first colocating DB, then adding script. 
""" + # Set experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create SmartSim Ensemble with two identical models colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create a SmartSim model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) + # Colocate a db with each ensemble entity and add a script + # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) - entity.add_script("test_script1", script_path=torch_script, device="CPU") + entity.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Colocate a db with the non-ensemble Model colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), + port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Add a script to the non-ensemble model torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + colo_ensemble.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) - colo_ensemble.add_script("test_script2", script=torch_script_str, device="CPU") - + # Add the third SmartSim model to the ensemble colo_ensemble.add_model(colo_model) - colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + + # Add another script via file to the entire ensemble + colo_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added one model to the ensemble assert len(colo_ensemble._db_scripts) == 1 # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils): +def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): 
"""Test DB Scripts on colocated DB from ensemble, first adding the script to the ensemble, then colocating the DB""" + # Set Experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher) + + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + # Create Ensemble with two identical SmartSim Model colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Create an additional SmartSim Model entity colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) + # Add a script via string to the ensemble members torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" - colo_ensemble.add_script("test_script2", script=torch_script_str, device="CPU") + colo_ensemble.add_script( + "test_script2", + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add a colocated database to the ensemble members + # and then add a script via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) - entity.add_script("test_script1", script_path=torch_script, device="CPU") + entity.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) + # Add a colocated database to the non-ensemble SmartSim Model colo_model.colocate_db( - port=wlmutils.get_test_port() + len(colo_ensemble), + port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface ) + # Add the non-ensemble SmartSim Model to the Ensemble + # and then add a script via file colo_ensemble.add_model(colo_model) - colo_model.add_script("test_script1", script_path=torch_script, device="CPU") + colo_model.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus + ) # Assert we have added one model to the ensemble assert len(colo_ensemble._db_scripts) == 1 # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) - exp.start(colo_ensemble, block=True) - statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + # Launch and check successful completion + try: + exp.start(colo_ensemble, block=True) + statuses = exp.get_status(colo_ensemble) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + finally: + exp.stop(colo_ensemble) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def 
test_db_script_errors(fileutils, wlmutils): +def test_db_script_errors(fileutils, wlmutils, mlutils): """Test DB Scripts error when setting a serialized function on colocated DB""" + # Set Experiment name exp_name = "test-colocated-db-script" - exp = Experiment(exp_name, launcher="local") - # get test setup + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path("torchscript.py") - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + # Create SmartSim experiment + exp = Experiment(exp_name, launcher=test_launcher) + # Create RunSettings + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + + # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) colo_model.colocate_db( - port=wlmutils.get_test_port(), + port=test_port, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Check that an error is raised for adding in-memory + # function when using colocated deployment with pytest.raises(SSUnsupportedError): - colo_model.add_function("test_func", function=timestwo, device="CPU") - - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + colo_model.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + # Create ensemble with two identical SmartSim Model entities + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) + # Add a colocated database for each ensemble member for i, entity in enumerate(colo_ensemble): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Check that an exception is raised when adding an in-memory + # function to the ensemble with colocated databases with pytest.raises(SSUnsupportedError): - colo_ensemble.add_function("test_func", function=timestwo, device="CPU") - - # create colocated model - colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + colo_ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + # Create an ensemble with two identical SmartSim Model entities + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) colo_ensemble.set_path(test_dir) - colo_ensemble.add_function("test_func", function=timestwo, device="CPU") + # Add an in-memory function to the ensemble + colo_ensemble.add_function( + "test_func", + function=timestwo, + device=test_device, + devices_per_node=test_num_gpus + ) + # Check that an error is raised when trying to add + # a colocated database to ensemble members that have 
+ # an in-memory function for i, entity in enumerate(colo_ensemble): with pytest.raises(SSUnsupportedError): entity.colocate_db( - port=wlmutils.get_test_port() + i, + port=test_port + i, db_cpus=1, limit_app_cpus=False, debug=True, - ifname="lo", + ifname=test_interface, ) + # Check that an error is raised when adding a model + # with a colocated database to an Ensemble that has + # an in-memory function with pytest.raises(SSUnsupportedError): colo_ensemble.add_model(colo_model) From 7b3b4635efb5236eba4c8417a5d8ff371812dd19 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 14:01:56 -0700 Subject: [PATCH 06/25] Add check for file and func parameters. --- smartsim/_core/entrypoints/colocated.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 29b63cce4..a88c037c0 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -141,9 +141,12 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: parser.add_argument("--device", type=str) parser.add_argument("--devices_per_node", type=int) args = parser.parse_args(db_script) + + if args.file and args.func: + raise ValueError("Both file and func cannot be provided.") + if args.func: func = args.func.replace("\\n", "\n") - if args.devices_per_node > 1 and args.device.lower() == "gpu": client.set_script_multigpu(args.name, func, 0, args.devices_per_node) else: client.set_script(args.name, func, args.device) elif args.file: if args.devices_per_node > 1 and args.device.lower() == "gpu": client.set_script_from_file_multigpu(args.name, args.file, 0, args.devices_per_node) else: client.set_script_from_file(args.name, args.file, args.device) + else: + raise ValueError("No file or func provided.") return args.name From ecb03eaf4cb04a963db9758509f9bf3c962622ad Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 14:24:26 -0700 Subject: [PATCH 07/25] Fix comment --- smartsim/entity/model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index d191aa6d3..b173a6ce6 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -462,10 +462,8 @@ def add_function( :param name: key to store function under :type name: str - :param script: TorchScript code - :type script: str or byte string, optional - :param script_path: path to TorchScript code - :type script_path: str, optional + :param function: TorchScript function code + :type function: str, optional :param device: device for script execution, defaults to "CPU" :type device: str, optional :param devices_per_node: The number of GPU devices available on the host. From 87898cb3703d760bfc61b0ef87744f8531e8c3b5 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 16:48:55 -0700 Subject: [PATCH 08/25] Add additional debug info to the dbscript tests.
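The dbscript tests compare tensors with pytest's approx(); when a comparison fails on a remote launcher, the log shows only the bare assertion error. The hunk below prints both sides of every comparison immediately before the assertion so the mismatch is visible in CI output. A condensed, illustrative sketch of the pattern (the Client construction and the input array are assumptions for illustration; the keys and script functions match the test script):

    import numpy as np
    from pytest import approx
    from smartredis import Client

    client = Client(cluster=False)  # assumption: SSDB points at the colocated DB
    array = np.array([1.0, 2.0, 3.0, 4.0])
    client.put_tensor("test_array", array)
    client.run_script("test_script1", "average", ["test_array"], ["test_output"])
    returned = client.get_tensor("test_output")
    # Print both values so a CI failure log shows the actual mismatch
    print(f"Return value from test_script1: {returned}")
    print(f"Expected value from test_script1: {np.mean(array)}")
    assert returned == approx(np.mean(array))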
--- tests/test_configs/run_dbscript_smartredis.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index 367d8d84e..c12515878 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -39,17 +39,23 @@ def main(): assert client.poll_model("test_script1", 500, 30) client.run_script("test_script1", "average", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") + print(f"Return value from test_script_1: {returned}") + print(f"Expected value from test_script1: {np.mean(array)}") assert returned == approx(np.mean(array)) assert client.poll_model("test_script2", 500, 30) client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) + print(f"Return value from test_script_2: {returned}") returned = client.get_tensor("test_output") + print(f"Expected value from test_script2: {-array}") assert returned == approx(-array) if client.model_exists("test_func"): client.run_script("test_func", "timestwo", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") + print(f"Return value from test_func: {returned}") + print(f"Expected value from test_func: {2 * array}") assert returned == approx(2 * array) print(f"Test worked!") From 469819ea4f02413120cc96171638356da181493d Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 17:27:16 -0700 Subject: [PATCH 09/25] Fix placement of debug output. --- tests/test_configs/run_dbscript_smartredis.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index c12515878..699af61e6 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -45,10 +45,9 @@ def main(): assert client.poll_model("test_script2", 500, 30) client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) - print(f"Return value from test_script_2: {returned}") returned = client.get_tensor("test_output") + print(f"Return value from test_script_2: {returned}") print(f"Expected value from test_script2: {-array}") - assert returned == approx(-array) if client.model_exists("test_func"): From ca71797f8246fff91829da52840124f9a4d64fee Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 16 Jun 2023 23:47:41 -0700 Subject: [PATCH 10/25] Replace colocate_db with colocate_db_tcp. 
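These tests exercise a TCP-connected colocated database, so the call sites below now use colocate_db_tcp explicitly rather than the generic colocate_db; the keyword arguments are unchanged. A minimal sketch of the updated call, assuming a local launcher and the suite's default port of 6780:

    import sys
    from smartsim import Experiment

    exp = Experiment("colo-example", launcher="local")
    settings = exp.create_run_settings(
        exe=sys.executable, exe_args="run_dbscript_smartredis.py"
    )
    model = exp.create_model("colocated_model", settings)

    # Same keyword arguments the tests pass; only the method name changes
    model.colocate_db_tcp(
        port=6780,  # assumption: the suite's default SMARTSIM_TEST_PORT
        db_cpus=1,
        limit_app_cpus=False,
        debug=True,
        ifname="lo",
    )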
--- tests/backends/test_dbmodel.py | 18 +++++++++--------- tests/backends/test_dbscript.py | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 54fa65a43..fe8ea6e86 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -384,7 +384,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -453,7 +453,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -518,7 +518,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -532,7 +532,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -638,7 +638,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): # Colocate a database with the first ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port = test_port + i, db_cpus=1, limit_app_cpus=False, @@ -661,7 +661,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): colo_ensemble.add_model(colo_model) # Colocate a database with the new ensemble member - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, @@ -714,7 +714,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -741,7 +741,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Colocate a db with each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -776,7 +776,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index a260fcf2b..7c9cf3dc0 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -234,7 +234,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( 
port=test_port, db_cpus=1, limit_app_cpus=False, @@ -313,7 +313,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -329,7 +329,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): ) # Colocate a db with the non-ensemble Model - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, @@ -418,7 +418,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): # and then add a script via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -434,7 +434,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): ) # Add a colocated database to the non-ensemble SmartSim Model - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port + len(colo_ensemble), db_cpus=1, limit_app_cpus=False, @@ -492,7 +492,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) - colo_model.colocate_db( + colo_model.colocate_db_tcp( port=test_port, db_cpus=1, limit_app_cpus=False, @@ -519,7 +519,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # Add a colocated database for each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, @@ -557,7 +557,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # an in-memory script for i, entity in enumerate(colo_ensemble): with pytest.raises(SSUnsupportedError): - entity.colocate_db( + entity.colocate_db_tcp( port=test_port + i, db_cpus=1, limit_app_cpus=False, From 2083fdc4b800af4ea57f84aa596181904a610e32 Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Wed, 21 Jun 2023 14:52:04 -0500 Subject: [PATCH 11/25] Remove debug statements. 
--- tests/test_configs/run_dbscript_smartredis.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_configs/run_dbscript_smartredis.py b/tests/test_configs/run_dbscript_smartredis.py index 699af61e6..b486b3fcd 100644 --- a/tests/test_configs/run_dbscript_smartredis.py +++ b/tests/test_configs/run_dbscript_smartredis.py @@ -39,22 +39,16 @@ def main(): assert client.poll_model("test_script1", 500, 30) client.run_script("test_script1", "average", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") - print(f"Return value from test_script_1: {returned}") - print(f"Expected value from test_script1: {np.mean(array)}") assert returned == approx(np.mean(array)) assert client.poll_model("test_script2", 500, 30) client.run_script("test_script2", "negate", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") - print(f"Return value from test_script_2: {returned}") - print(f"Expected value from test_script2: {-array}") assert returned == approx(-array) if client.model_exists("test_func"): client.run_script("test_func", "timestwo", ["test_array"], ["test_output"]) returned = client.get_tensor("test_output") - print(f"Return value from test_func: {returned}") - print(f"Expected value from test_func: {2 * array}") assert returned == approx(2 * array) print(f"Test worked!") From 40215569839a31c18d343aa1e0c435628df0e659 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:43:33 -0700 Subject: [PATCH 12/25] Fix return type of test_num_gpus --- smartsim/_core/config/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 42efb7441..06701e8f4 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -148,7 +148,7 @@ def log_level(self) -> str: @property def jm_interval(self) -> int: - return int(os.environ.get("SMARTSIM_JM_INTERVAL", 10)) + return int(os.environ.get("SMARTSIM_JM_INTERVAL") or 10) @property def wlm_trials(self) -> int: @@ -163,8 +163,8 @@ def test_device(self) -> str: # pragma: no cover return os.environ.get("SMARTSIM_TEST_DEVICE", "CPU") @property - def test_num_gpus(self) -> str: # pragma: no cover - return os.environ.get("SMARTSIM_TEST_NUM_GPUS", 1) + def test_num_gpus(self) -> int: # pragma: no cover + return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS") or 1) @property def test_port(self) -> int: # pragma: no cover From fb61db3a23f8938bd3f2294f52bfe59e1acc9df0 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:47:12 -0700 Subject: [PATCH 13/25] Fix default handling of wlm_trials.
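This applies the same environment-variable handling used for jm_interval and test_num_gpus in the previous patch. A plausible motivation, sketched below: a dict-style default in os.environ.get() only applies when the variable is unset, so a variable exported as an empty string (an assumed scenario, e.g. from a CI template) reaches int() and raises; the `or` form treats unset and empty the same way.

    import os

    os.environ["SMARTSIM_WLM_TRIALS"] = ""  # set, but empty

    # Old form: the default applies only when the variable is unset,
    # so the empty string reaches int() and raises ValueError
    try:
        int(os.environ.get("SMARTSIM_WLM_TRIALS", 10))
    except ValueError:
        print("int('') raises ValueError")

    # New form: `or` also replaces the falsy empty string
    print(int(os.environ.get("SMARTSIM_WLM_TRIALS") or 10))  # 10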
--- smartsim/_core/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 06701e8f4..99db81196 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -152,7 +152,7 @@ def jm_interval(self) -> int: @property def wlm_trials(self) -> int: - return int(os.environ.get("SMARTSIM_WLM_TRIALS", 10)) + return int(os.environ.get("SMARTSIM_WLM_TRIALS") or 10) @property def test_launcher(self) -> str: # pragma: no cover From bb298a4e738965c4de6f18bd052ae98b162ce62b Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:48:17 -0700 Subject: [PATCH 14/25] Fix global nature of test_num_gpus --- conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/conftest.py b/conftest.py index 25a8a76a4..9b401fd5a 100644 --- a/conftest.py +++ b/conftest.py @@ -595,5 +595,4 @@ def get_test_device(): @staticmethod def get_test_num_gpus(): - global test_num_gpus return test_num_gpus From c439d4dae93652af08784da3201633e211372c6d Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 14:58:54 -0700 Subject: [PATCH 15/25] Fix type hint on colocate_db_tcp --- smartsim/entity/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index b173a6ce6..fe2ead1e1 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -221,7 +221,7 @@ def colocate_db_uds( def colocate_db_tcp( self, port: int = 6379, - ifname: str = "lo", + ifname: t.Union[str, list[str]] = "lo", db_cpus: int = 1, limit_app_cpus: bool = True, debug: bool = False, @@ -252,7 +252,7 @@ def colocate_db_tcp( :param port: port to use for orchestrator database, defaults to 6379 :type port: int, optional :param ifname: interface to use for orchestrator, defaults to "lo" - :type ifname: str, optional + :type ifname: str | list[str], optional :param db_cpus: number of cpus to use for orchestrator, defaults to 1 :type db_cpus: int, optional :param limit_app_cpus: whether to limit the number of cpus used by the app, defaults to True From 467bd92f4c6dc928109e400e2ae46a751959d31f Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 15:16:54 -0700 Subject: [PATCH 16/25] Make consistent the network interface for colocated db. 
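For context on the hunks below: test_colocated_db_model_tf hard-coded the interface, so the SMARTSIM_TEST_INTERFACE setting was ignored in that test, while the other tests carried a leftover hard-coded assignment that the configured value immediately overwrote as dead code. A condensed before/after of the pattern being removed (names follow the tests):

    # Before: the first assignment is either the only one, or dead code
    test_interface = "lo"
    test_interface = wlmutils.get_test_interface()

    # After: a single source of truth
    test_interface = wlmutils.get_test_interface()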
--- tests/backends/test_dbmodel.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index fe8ea6e86..3705b806a 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -368,7 +368,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" + test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() @@ -436,7 +436,6 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() @@ -495,7 +494,6 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() @@ -597,7 +595,6 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() @@ -697,7 +694,6 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() - test_interface = "lo" test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() From 133aaad88644d0af8d15d39fbba398d4b87189f2 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 15:37:22 -0700 Subject: [PATCH 17/25] Update docs --- doc/testing.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/testing.rst b/doc/testing.rst index 249752b93..0c27c266b 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -105,6 +105,8 @@ are a few other runtime test configuration options for SmartSim - ``SMARTSIM_TEST_LAUNCHER``: Workload manager of the system (local by default) - ``SMARTSIM_TEST_ACCOUNT``: Project account for allocations (used for customer systems mostly) - ``SMARTSIM_TEST_DEVICE``: ``cpu`` or ``gpu`` + - ``SMARTSIM_TEST_NUM_GPUS``: the number of GPUs to use for model and script testing (defaults to 1) + - ``SMARTSIM_TEST_PORT``: the port to use for database communication - ``SMARTSIM_TEST_INTERFACE``: network interface to use. For the ``SMARTSIM_TEST_INTERFACE``, the default is ``ipogif0`` which From 73749c4fc72771d46d97828d50bd9918b9a3df56 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 15:45:35 -0700 Subject: [PATCH 18/25] Update docs. --- doc/testing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/testing.rst b/doc/testing.rst index 0c27c266b..d7091008d 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -25,7 +25,7 @@ level of the SmartSim directory:: .. note:: -You must have the extra dev dependencies installed in +You must have the extra dev dependencies installed in your python environment to execute tests. 
Install ``dev`` dependencies with ``pip`` @@ -106,7 +106,7 @@ are a few other runtime test configuration options for SmartSim - ``SMARTSIM_TEST_ACCOUNT``: Project account for allocations (used for customer systems mostly) - ``SMARTSIM_TEST_DEVICE``: ``cpu`` or ``gpu`` - ``SMARTSIM_TEST_NUM_GPUS``: the number of GPUs to use for model and script testing (defaults to 1) - - ``SMARTSIM_TEST_PORT``: the port to use for database communication + - ``SMARTSIM_TEST_PORT``: the port to use for database communication (defaults to 6780) - ``SMARTSIM_TEST_INTERFACE``: network interface to use. For the ``SMARTSIM_TEST_INTERFACE``, the default is ``ipogif0`` which From ba3e53931f698335243a28dbacc19565ed92b5be Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 17:28:20 -0700 Subject: [PATCH 19/25] Test setting lo interface for CI/CD --- .github/workflows/run_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 91e717c09..c9c9e5288 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -121,6 +121,7 @@ jobs: - name: Run Pytest run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV + echo "SMARTSIM_TEST_INTERFACE=lo" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/ - name: Upload Pytest coverage to Codecov From 4f3a329430300142ff66f146ae27cd7070080c2b Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 20:20:50 -0700 Subject: [PATCH 20/25] Check CI/CD env var --- conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conftest.py b/conftest.py index 9b401fd5a..c965dcef0 100644 --- a/conftest.py +++ b/conftest.py @@ -74,6 +74,10 @@ def print_test_configuration(): global test_nic global test_alloc_specs_path global test_port + + env_var = os.environ.get("SMARTSIM_TEST_INTERFACE") + print(f"THE ENVIRONMENT VALUE OF SMARTSIM_TEST_INTERFACE is {env_var}") + print("TEST_SMARTSIM_LOCATION:", smartsim.__path__) print("TEST_PATH:", test_path) print("TEST_LAUNCHER:", test_launcher) From 0374a70f4858e1d472c39e5cea4bcf11816e8bfe Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 21:43:55 -0700 Subject: [PATCH 21/25] Fix env var test for run_tests.yml --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index c9c9e5288..8d622b768 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -64,6 +64,7 @@ jobs: env: SMARTSIM_REDISAI: ${{ matrix.rai }} + SMARTSIM_TEST_INTERFACE: lo steps: - uses: actions/checkout@v2 @@ -121,7 +122,6 @@ jobs: - name: Run Pytest run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV - echo "SMARTSIM_TEST_INTERFACE=lo" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/ - name: Upload Pytest coverage to Codecov From 5f29fc84a74fcf9e1f9cd2b6ad84bdaee606be48 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 22:40:31 -0700 Subject: [PATCH 22/25] Fix error in the test_dbmodel.py port numbers. 
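In test_colocated_db_model_ensemble, the standalone colocated model already uses test_port, so ensemble member 0 (test_port + 0) requested the same port for its database. Offsetting the members by i + 1 keeps every colocated database on a distinct port. A quick illustration, assuming the suite's default port of 6780:

    test_port = 6780
    colo_model_port = test_port                            # standalone model: 6780
    member_ports = [test_port + i + 1 for i in range(2)]   # members: 6781, 6782
    assert colo_model_port not in member_ports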
--- tests/backends/test_dbmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 3705b806a..950782b56 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -531,7 +531,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Colocate a database with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): entity.colocate_db_tcp( - port=test_port + i, + port=test_port + i + 1, db_cpus=1, limit_app_cpus=False, debug=True, From 4d03b6cbe52468332ff6760bf6cdfcfd62f13df3 Mon Sep 17 00:00:00 2001 From: Matthew Ellis Date: Fri, 23 Jun 2023 23:14:30 -0700 Subject: [PATCH 23/25] Remove conftest.py debug statement. --- conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/conftest.py b/conftest.py index c965dcef0..1fdf0fa88 100644 --- a/conftest.py +++ b/conftest.py @@ -75,9 +75,6 @@ def print_test_configuration(): global test_alloc_specs_path global test_port - env_var = os.environ.get("SMARTSIM_TEST_INTERFACE") - print(f"THE ENVIRONMENT VALUE OF SMARTSIM_TEST_INTERFACE is {env_var}") - print("TEST_SMARTSIM_LOCATION:", smartsim.__path__) print("TEST_PATH:", test_path) print("TEST_LAUNCHER:", test_launcher) From 66ebb312a87c90c991384b5828b8a5d8ae23181b Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Wed, 28 Jun 2023 12:48:37 -0500 Subject: [PATCH 24/25] Change default testing interface to lo instead of ipogif0. --- .github/workflows/run_tests.yml | 1 - smartsim/_core/config/config.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 8d622b768..91e717c09 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -64,7 +64,6 @@ jobs: env: SMARTSIM_REDISAI: ${{ matrix.rai }} - SMARTSIM_TEST_INTERFACE: lo steps: - uses: actions/checkout@v2 diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 99db81196..4c69f2943 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -188,7 +188,7 @@ def test_interface(self) -> t.List[str]: # pragma: no cover elif "ib0" in net_if_addrs: return ["ib0"] # default to aries network - return ["ipogif0"] + return ["lo"] @property def test_account(self) -> t.Optional[str]: # pragma: no cover From af388068f65bda9ff50ae9ece6a4ac1cf320b5ee Mon Sep 17 00:00:00 2001 From: Matt Ellis Date: Wed, 28 Jun 2023 12:52:46 -0500 Subject: [PATCH 25/25] Update changelog. --- doc/changelog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index d432722c6..9110aae58 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -43,7 +43,8 @@ Detailed notes unexpectedly due to copy-by-ref (PR305_) - The underlying code for Model.add_ml_model() and Model.add_script() was fixed to correctly handle multi-GPU configurations. Tests were updated to run on - non-local launchers. Documentation was updated and fixed. (PR304_) + non-local launchers. Documentation was updated and fixed. Also, the default + testing interface has been changed to lo instead of ipogif0. (PR304_) - Typehints have been added. A makefile target `make check-mypy` executes static analysis with mypy. (PR295_, PR303_) - Simplify code in `random_permutations` parameter generation strategy (PR300_)