CrayLabs · amandarichardsonn · Mar 28, 2024 · Mar 20, 2024 · Mar 21, 2024 · Mar 21, 2024
diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -18,6 +18,7 @@ To be released at some future point in time
 
 Description
 
+- Promote device options to an Enum
 - Update telemetry monitor, add telemetry collectors
 - Add method to specify node features for a Slurm job
 - Colo Orchestrator setup now blocks application start until setup finished
@@ -33,6 +34,7 @@ Description
 
 Detailed Notes
 
+- Promote devices to a dedicated Enum type throughout the SmartSim code base.
 - Update the telemetry monitor to enable retrieval of metrics on a scheduled
   interval. Switch basic experiment tracking telemetry to default to on. Add
   database metric collectors. Improve telemetry monitor logging. Create
@@ -70,6 +72,7 @@ Detailed Notes
 - Remove previously deprecated behavior present in test suite on machines with
   Slurm and Open MPI. (SmartSim-PR520_)
 
+.. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
 .. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
 .. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512
 .. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529

diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py
@@ -43,7 +43,7 @@
     VersionConflictError,
     Versioner,
 )
-from smartsim._core._install.builder import BuildError
+from smartsim._core._install.builder import BuildError, Device
 from smartsim._core.config import CONFIG
 from smartsim._core.utils.helpers import installed_redisai_backends
 from smartsim.error import SSConfigError
@@ -54,8 +54,6 @@
 # NOTE: all smartsim modules need full paths as the smart cli
 #       may be installed into a different directory.
 
-
-_TDeviceStr = t.Literal["cpu", "gpu"]
 _TPinningStr = t.Literal["==", "!=", ">=", ">", "<=", "<", "~="]
 
 
@@ -134,7 +132,7 @@ def build_database(
 def build_redis_ai(
     build_env: BuildEnv,
     versions: Versioner,
-    device: _TDeviceStr,
+    device: Device,
     use_torch: bool = True,
     use_tf: bool = True,
     use_onnx: bool = False,
@@ -143,7 +141,7 @@ def build_redis_ai(
     verbose: bool = False,
 ) -> None:
     # make sure user isn't trying to do something silly on MacOS
-    if build_env.PLATFORM == "darwin" and device == "gpu":
+    if build_env.PLATFORM == "darwin" and device == Device.GPU:
         raise BuildError("SmartSim does not support GPU on MacOS")
 
     # decide which runtimes to build
@@ -154,7 +152,7 @@ def build_redis_ai(
         ["ONNX", versions.ONNX, color_bool(use_onnx)],
     ]
     print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n")
-    print(f"Building for GPU support: {color_bool(device == 'gpu')}\n")
+    print(f"Building for GPU support: {color_bool(device == Device.GPU)}\n")
 
     if not check_backends_install():
         sys.exit(1)
@@ -195,7 +193,7 @@ def build_redis_ai(
     else:
         # get the build environment, update with CUDNN env vars
         # if present and building for GPU, otherwise warn the user
-        if device == "gpu":
+        if device == Device.GPU:
             gpu_env = build_env.get_cudnn_env()
             cudnn_env_vars = [
                 "CUDNN_LIBRARY",
@@ -226,18 +224,16 @@ def build_redis_ai(
         logger.info("ML Backends and RedisAI build complete!")
 
 
-def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") -> None:
+def check_py_torch_version(versions: Versioner, device: Device = Device.CPU) -> None:
     """Check Python environment for TensorFlow installation"""
-
-    device = device_in.lower()
     if BuildEnv.is_macos():
-        if device == "gpu":
+        if device == Device.GPU:
             raise BuildError("SmartSim does not support GPU on MacOS")
         device_suffix = ""
     else:  # linux
-        if device == "cpu":
+        if device == Device.CPU:
             device_suffix = versions.TORCH_CPU_SUFFIX
-        elif device == "gpu":
+        elif device == Device.GPU:
             device_suffix = versions.TORCH_CUDA_SUFFIX
         else:
             raise BuildError("Unrecognized device requested")
@@ -261,7 +257,9 @@ def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu")
             "Torch version not found in python environment. "
             "Attempting to install via `pip`"
         )
-        wheel_device = device if device == "cpu" else device_suffix.replace("+", "")
+        wheel_device = (
+            device.value if device == Device.CPU else device_suffix.replace("+", "")
+        )
         pip(
             "install",
             "--extra-index-url",
@@ -363,8 +361,7 @@ def execute(
 ) -> int:
     verbose = args.v
     keydb = args.keydb
-    device: _TDeviceStr = args.device
-
+    device = Device(args.device.lower())
     # torch and tf build by default
     pt = not args.no_pt  # pylint: disable=invalid-name
     tf = not args.no_tf  # pylint: disable=invalid-name
@@ -453,8 +450,8 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--device",
         type=str.lower,
-        default="cpu",
-        choices=["cpu", "gpu"],
+        default=Device.CPU.value,
+        choices=[device.value for device in Device],
         help="Device to build ML runtimes for",
     )
     parser.add_argument(

diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
@@ -40,6 +40,7 @@
 
 from smartsim import Experiment
 from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
+from smartsim._core._install.builder import Device
 from smartsim._core.utils.helpers import installed_redisai_backends
 from smartsim.log import get_logger
 
@@ -61,9 +62,6 @@
     _TemporaryDirectory = tempfile.TemporaryDirectory
 
 
-_TCapitalDeviceStr = t.Literal["CPU", "GPU"]
-
-
 class _VerificationTempDir(_TemporaryDirectory):
     """A Temporary directory to be used as a context manager that will only
     clean itself up if no error is raised within its context
@@ -88,7 +86,7 @@ def execute(
     simple experiment
     """
     backends = installed_redisai_backends()
-    device: _TCapitalDeviceStr = args.device.upper()
+    device: Device = Device(args.device)
     try:
         with contextlib.ExitStack() as ctx:
             temp_dir = ctx.enter_context(_VerificationTempDir(dir=os.getcwd()))
@@ -98,7 +96,7 @@ def execute(
                     "SR_LOG_FILE", os.path.join(temp_dir, "smartredis.log")
                 ),
             }
-            if device == "GPU":
+            if device == Device.GPU:
                 validate_env["CUDA_VISIBLE_DEVICES"] = "0"
             ctx.enter_context(_env_vars_set_to(validate_env))
             test_install(
@@ -136,16 +134,16 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--device",
         type=str.lower,
-        default="cpu",
-        choices=["cpu", "gpu"],
+        default=Device.CPU.value,
+        choices=[device.value for device in Device],
         help="Device to test the ML backends against",
     )
 
 
 def test_install(
     location: str,
     port: t.Optional[int],
-    device: _TCapitalDeviceStr,
+    device: Device,
     with_tf: bool,
     with_pt: bool,
     with_onnx: bool,
@@ -214,7 +212,7 @@ def _find_free_port() -> int:
         return int(port)
 
 
-def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -> None:
+def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None:
     recv_conn, send_conn = mp.Pipe(duplex=False)
     # Build the model in a subproc so that keras does not hog the gpu
     proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir))
@@ -236,7 +234,12 @@ def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -
         ) from e
 
     client.set_model_from_file(
-        "keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs
+        "keras-fcn",
+        model_path,
+        "TF",
+        device=device.value.upper(),
+        inputs=inputs,
+        outputs=outputs,
     )
     client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32))
     client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"])
@@ -264,7 +267,7 @@ def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None:
     conn.send((model_path, inputs, outputs))
 
 
-def _test_torch_install(client: Client, device: _TCapitalDeviceStr) -> None:
+def _test_torch_install(client: Client, device: Device) -> None:
     import torch
     from torch import nn
 
@@ -276,7 +279,7 @@ def __init__(self) -> None:
         def forward(self, x: torch.Tensor) -> torch.Tensor:
             return self.conv(x)
 
-    if device == "GPU":
+    if device == Device.GPU:
         device_ = torch.device("cuda")
     else:
         device_ = torch.device("cpu")
@@ -292,13 +295,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
     torch.jit.save(traced, buffer)  # type: ignore[no-untyped-call]
     model = buffer.getvalue()
 
-    client.set_model("torch-nn", model, backend="TORCH", device=device)
+    client.set_model("torch-nn", model, backend="TORCH", device=device.value.upper())
     client.put_tensor("torch-in", torch.rand(1, 1, 3, 3).numpy())
     client.run_model("torch-nn", inputs=["torch-in"], outputs=["torch-out"])
     client.get_tensor("torch-out")
 
 
-def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None:
+def _test_onnx_install(client: Client, device: Device) -> None:
     from skl2onnx import to_onnx
     from sklearn.cluster import KMeans
 
@@ -311,7 +314,7 @@ def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None:
     sample = np.arange(20, dtype=np.float32).reshape(10, 2)
 
     client.put_tensor("onnx-input", sample)
-    client.set_model("onnx-kmeans", model, "ONNX", device=device)
+    client.set_model("onnx-kmeans", model, "ONNX", device=device.value.upper())
     client.run_model(
         "onnx-kmeans", inputs=["onnx-input"], outputs=["onnx-labels", "onnx-transform"]
     )