CrayLabs · mellis13 · Jun 28, 2023 · Jun 14, 2023 · Jun 14, 2023 · Jun 14, 2023
diff --git a/conftest.py b/conftest.py
@@ -51,6 +51,7 @@
 test_dir = os.path.join(test_path, "tests", "test_output")
 test_launcher = CONFIG.test_launcher
 test_device = CONFIG.test_device
+test_num_gpus = CONFIG.test_num_gpus
 test_nic = CONFIG.test_interface
 test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None)
 test_port = CONFIG.test_port
@@ -73,6 +74,7 @@ def print_test_configuration():
     global test_nic
     global test_alloc_specs_path
     global test_port
+
     print("TEST_SMARTSIM_LOCATION:", smartsim.__path__)
     print("TEST_PATH:", test_path)
     print("TEST_LAUNCHER:", test_launcher)
@@ -591,3 +593,7 @@ class MLUtils:
     def get_test_device():
         global test_device
         return test_device
+
+    @staticmethod
+    def get_test_num_gpus():
+        return test_num_gpus
diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst
@@ -431,6 +431,10 @@ Model
    Model.colocate_db
    Model.colocate_db_tcp
    Model.colocate_db_uds
+   Model.colocated
+   Model.add_ml_model
+   Model.add_script
+   Model.add_function
    Model.params_to_args
    Model.register_incoming_entity
    Model.enable_key_prefixing

diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -25,6 +25,7 @@ Description
 
 A full list of changes and detailed notes can be found below:
 
+- Fix add_ml_model() and add_script() documentation, tests, and code
 - Remove wait time associated with Experiment launch summary
 - Update and rename Redis conf file
 - Migrate from redis-py-cluster to redis-py
@@ -38,8 +39,12 @@ A full list of changes and detailed notes can be found below:
 
 Detailed notes
 
-- Fix defect where dictionaries used to create run settings can be changed 
+- Fix defect where dictionaries used to create run settings can be changed
   unexpectedly due to copy-by-ref (PR305_)
+- The underlying code for Model.add_ml_model() and Model.add_script() was fixed
+  to correctly handle multi-GPU configurations.  Tests were updated to run on
+  non-local launchers.  Documentation was updated and fixed.  Also, the default
+  testing interface has been changed to lo instead of ipogif0. (PR304_)
 - Typehints have been added. A makefile target `make check-mypy` executes static
   analysis with mypy. (PR295_, PR303_)
 - Simplify code in `random_permutations` parameter generation strategy (PR300_)
@@ -48,17 +53,18 @@ Detailed notes
 - Migrate from redis-py-cluster to redis-py for cluster status checks (PR292_)
 - Update full test suite to no longer require a tensorflow wheel to be available at test time. (PR291_)
 - Correct spelling of colocated in doc strings (PR290_)
-- Deprecated launcher-specific orchestrators, constants, and ML 
+- Deprecated launcher-specific orchestrators, constants, and ML
   utilities were removed. (PR289_)
 - Relax the coloredlogs version to be greater than 10.0 (PR288_)
 - Update the Github Actions runner image from `macos-10.15`` to `macos-12``. The
   former began deprecation in May 2022 and was finally removed in May 2023. (PR285_)
-- The Fortran tutorials had not been fully updated to show how to handle 
+- The Fortran tutorials had not been fully updated to show how to handle
   return/error codes. These have now all been updated. (PR284_)
-- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The 
+- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The
   argument name is still `interface` for backward compatibility reasons. (PR281_)
 
 .. _PR305: https://github.com/CrayLabs/SmartSim/pull/305
+.. _PR304: https://github.com/CrayLabs/SmartSim/pull/304
 .. _PR303: https://github.com/CrayLabs/SmartSim/pull/303
 .. _PR300: https://github.com/CrayLabs/SmartSim/pull/300
 .. _PR298: https://github.com/CrayLabs/SmartSim/pull/298

diff --git a/doc/testing.rst b/doc/testing.rst
@@ -25,7 +25,7 @@ level of the SmartSim directory::
 
 .. note::
 
-You must have the extra dev dependencies installed in 
+You must have the extra dev dependencies installed in
 your python environment to execute tests.
 
 Install ``dev`` dependencies with ``pip``
@@ -105,6 +105,8 @@ are a few other runtime test configuration options for SmartSim
  - ``SMARTSIM_TEST_LAUNCHER``: Workload manager of the system (local by default)
  - ``SMARTSIM_TEST_ACCOUNT``: Project account for allocations (used for customer systems mostly)
  - ``SMARTSIM_TEST_DEVICE``: ``cpu`` or ``gpu``
+ - ``SMARTSIM_TEST_NUM_GPUS``: the number of GPUs to use for model and script testing (defaults to 1)
+ - ``SMARTSIM_TEST_PORT``: the port to use for database communication (defaults to 6780)
  - ``SMARTSIM_TEST_INTERFACE``: network interface to use.
 
 For the ``SMARTSIM_TEST_INTERFACE``, the default is ``ipogif0`` which

diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py
@@ -79,6 +79,10 @@
 #  - CPU or GPU for model serving tests
 #  - Default: CPU
 #
+# SMARTSIM_TEST_NUM_GPUS
+#  - Number of GPUs on the host for testing
+#  - Default: 1
+#
 # SMARTSIM_TEST_ACCOUNT
 #  - Account used to run full launcher test suite on external systems
 #  - Default: None
@@ -144,11 +148,11 @@ def log_level(self) -> str:
 
     @property
     def jm_interval(self) -> int:
-        return int(os.environ.get("SMARTSIM_JM_INTERVAL", 10))
+        return int(os.environ.get("SMARTSIM_JM_INTERVAL") or 10)
 
     @property
     def wlm_trials(self) -> int:
-        return int(os.environ.get("SMARTSIM_WLM_TRIALS", 10))
+        return int(os.environ.get("SMARTSIM_WLM_TRIALS") or 10)
 
     @property
     def test_launcher(self) -> str:  # pragma: no cover
@@ -158,6 +162,10 @@ def test_launcher(self) -> str:  # pragma: no cover
     def test_device(self) -> str:  # pragma: no cover
         return os.environ.get("SMARTSIM_TEST_DEVICE", "CPU")
 
+    @property
+    def test_num_gpus(self) -> int:  # pragma: no cover
+        return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS") or 1)
+
     @property
     def test_port(self) -> int:  # pragma: no cover
         return int(os.environ.get("SMARTSIM_TEST_PORT", 6780))
@@ -180,7 +188,7 @@ def test_interface(self) -> t.List[str]:  # pragma: no cover
         elif "ib0" in net_if_addrs:
             return ["ib0"]
         # default to aries network
-        return ["ipogif0"]
+        return ["lo"]
 
     @property
     def test_account(self) -> t.Optional[str]:  # pragma: no cover

diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py
@@ -92,7 +92,22 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
     if args.outputs:
         outputs = list(args.outputs)
 
-    if args.devices_per_node == 1:
+    # devices_per_node being greater than one only applies
+    # to GPU devices
+    if args.devices_per_node > 1 and args.device.lower() == "gpu":
+        client.set_model_from_file_multigpu(
+            args.name,
+            args.file,
+            args.backend,
+            0,
+            args.devices_per_node,
+            args.batch_size,
+            args.min_batch_size,
+            args.tag,
+            inputs,
+            outputs
+        )
+    else:
         client.set_model_from_file(
             args.name,
             args.file,
@@ -102,21 +117,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
             args.min_batch_size,
             args.tag,
             inputs,
-            outputs,
+            outputs
         )
-    else:
-        for device_num in range(args.devices_per_node):
-            client.set_model_from_file(
-                args.name,
-                args.file,
-                args.backend,
-                args.device + f":{device_num}",
-                args.batch_size,
-                args.min_batch_size,
-                args.tag,
-                inputs,
-                outputs,
-            )
 
     return args.name
 
@@ -139,22 +141,23 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str:
     parser.add_argument("--device", type=str)
     parser.add_argument("--devices_per_node", type=int)
     args = parser.parse_args(db_script)
+
+    if args.file and args.func:
+        raise ValueError("Both file and func cannot be provided.")
+
     if args.func:
         func = args.func.replace("\\n", "\n")
-
-        if args.devices_per_node == 1:
-            client.set_script(args.name, func, args.device)
+        if args.devices_per_node > 1 and args.device.lower() == "gpu":
+            client.set_script_multigpu(args.name, func, 0, args.devices_per_node)
         else:
-            for device_num in range(args.devices_per_node):
-                client.set_script(args.name, func, args.device + f":{device_num}")
+            client.set_script(args.name, func, args.device)
     elif args.file:
-        if args.devices_per_node == 1:
-            client.set_script_from_file(args.name, args.file, args.device)
+        if args.devices_per_node > 1 and args.device.lower() == "gpu":
+            client.set_script_from_file_multigpu(args.name, args.file, 0, args.devices_per_node)
         else:
-            for device_num in range(args.devices_per_node):
-                client.set_script_from_file(
-                    args.name, args.file, args.device + f":{device_num}"
-                )
+            client.set_script_from_file(args.name, args.file, args.device)
+    else:
+        raise ValueError("No file or func provided.")
 
     return args.name
 

diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py
@@ -75,7 +75,7 @@ def __init__(
         self._db_models: t.List[DBModel] = []
         self._db_scripts: t.List[DBScript] = []
         self.files: t.Optional[EntityFiles] = None
-        
+
     @property
     def colocated(self) -> bool:
         """Return True if this Model will run with a colocated Orchestrator"""
@@ -221,7 +221,7 @@ def colocate_db_uds(
     def colocate_db_tcp(
         self,
         port: int = 6379,
-        ifname: str = "lo",
+        ifname: t.Union[str, list[str]] = "lo",
         db_cpus: int = 1,
         limit_app_cpus: bool = True,
         debug: bool = False,
@@ -252,7 +252,7 @@ def colocate_db_tcp(
         :param port: port to use for orchestrator database, defaults to 6379
         :type port: int, optional
         :param ifname: interface to use for orchestrator, defaults to "lo"
-        :type ifname: str, optional
+        :type ifname: str | list[str], optional
         :param db_cpus: number of cpus to use for orchestrator, defaults to 1
         :type db_cpus: int, optional
         :param limit_app_cpus: whether to limit the number of cpus used by the app, defaults to True
@@ -356,14 +356,18 @@ def add_ml_model(
 
         :param name: key to store model under
         :type name: str
-        :param model: model in memory
+        :param backend: name of the backend (TORCH, TF, TFLITE, ONNX)
+        :type backend: str
+        :param model: A model in memory (only supported for non-colocated orchestrators)
         :type model: byte string, optional
         :param model_path: serialized model
         :type model_path: file path to model
-        :param backend: name of the backend (TORCH, TF, TFLITE, ONNX)
-        :type backend: str
         :param device: name of device for execution, defaults to "CPU"
         :type device: str, optional
+        :param devices_per_node: The number of GPU devices available on the host.
+               This parameter only applies to GPU devices and will be ignored if device
+               is specified as CPU.
+        :type devices_per_node: int
         :param batch_size: batch size for execution, defaults to 0
         :type batch_size: int, optional
         :param min_batch_size: minimum batch size for model execution, defaults to 0
@@ -415,13 +419,15 @@ def add_script(
 
         :param name: key to store script under
         :type name: str
-        :param script: TorchScript code
+        :param script: TorchScript code (only supported for non-colocated orchestrators)
         :type script: str, optional
         :param script_path: path to TorchScript code
         :type script_path: str, optional
         :param device: device for script execution, defaults to "CPU"
         :type device: str, optional
-        :param devices_per_node: number of devices on each host
+        :param devices_per_node: The number of GPU devices available on the host.
+               This parameter only applies to GPU devices and will be ignored if device
+               is specified as CPU.
         :type devices_per_node: int
         """
         db_script = DBScript(
@@ -456,13 +462,13 @@ def add_function(
 
         :param name: key to store function under
         :type name: str
-        :param script: TorchScript code
-        :type script: str or byte string, optional
-        :param script_path: path to TorchScript code
-        :type script_path: str, optional
+        :param function: TorchScript function code
+        :type function: str, optional
         :param device: device for script execution, defaults to "CPU"
         :type device: str, optional
-        :param devices_per_node: number of devices on each host
+        :param devices_per_node: The number of GPU devices available on the host.
+               This parameter only applies to GPU devices and will be ignored if device
+               is specified as CPU.
         :type devices_per_node: int
         """
         db_script = DBScript(