CrayLabs · ankona · Oct 10, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 20, 2024
diff --git a/conftest.py b/conftest.py
@@ -93,6 +93,7 @@
 test_hostlist = None
 has_aprun = shutil.which("aprun") is not None
 
+
 def get_account() -> str:
     return test_account
 
@@ -459,15 +460,10 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None:
 
 @pytest.fixture(scope="function", autouse=True)
 def check_output_dir() -> None:
-    try:
-        global test_output_dirs
-        assert os.path.isdir(test_output_root)
-        assert len(os.listdir(test_output_root)) >= test_output_dirs
-        test_output_dirs = len(os.listdir(test_output_root))
-    except Exception:
-        # swallow error when the tests can't clean up test dirs
-        # and let the next run do the job.
-        ...
+    global test_output_dirs
+    assert os.path.isdir(test_output_root)
+    assert len(os.listdir(test_output_root)) >= test_output_dirs
+    test_output_dirs = len(os.listdir(test_output_root))
 
 
 @pytest.fixture

diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py
@@ -53,7 +53,7 @@
 from smartsim.log import get_logger, log_to_file
 from smartsim.protoclient import ProtoClient
 
-logger = get_logger("App", "DEBUG")
+logger = get_logger("App")
 
 
 CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False

diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py
@@ -135,7 +135,7 @@ def service_as_dragon_proc(
     args = parser.parse_args()
 
     connect_to_infrastructure()
-    ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"]
+    ddict_str = os.environ[BackboneFeatureStore.MLI_BACKBONE]
 
     backbone = BackboneFeatureStore.from_descriptor(ddict_str)
 

diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py
@@ -95,14 +95,13 @@ def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]:
 def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None:
     """Create a .env file with required environment variables for the Dragon runtime"""
     dragon_root = str(dragon_root_dir)
-    dragon_rut_dir = dragon_root
     dragon_inc_dir = dragon_root + "/include"
     dragon_lib_dir = dragon_root + "/lib"
     dragon_bin_dir = dragon_root + "/bin"
 
     dragon_vars = {
         "DRAGON_BASE_DIR": dragon_root,
-        "DRAGON_ROOT_DIR": dragon_rut_dir,
+        "DRAGON_ROOT_DIR": dragon_root,
         "DRAGON_INCLUDE_DIR": dragon_inc_dir,
         "DRAGON_LIB_DIR": dragon_lib_dir,
         "DRAGON_VERSION": dragon_version,

diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py
@@ -42,19 +42,21 @@ class Service(ABC):
     def __init__(
         self,
         as_service: bool = False,
-        cooldown: int = 0,
-        loop_delay: int = 0,
+        cooldown: float = 0,
+        loop_delay: float = 0,
         health_check_frequency: float = 0,
     ) -> None:
         """Initialize the ServiceHost
 
-        :param as_service: Determines if the host will run until shutdown criteria
-        are met or as a run-once instance
-        :param cooldown: Period of time to allow service to run before automatic
-        shutdown, in seconds. A non-zero, positive integer.
-        :param loop_delay: Delay between iterations of the event loop (in seconds)
-        :param health_check_frequency: Delay between calls to a
-        health check handler (in seconds)
+        :param as_service: Determines if the host runs continuously until
+        shutdown criteria are met, or executes the service lifecycle once and exits
+        :param cooldown: Period of time (in seconds) to allow the service to run
+         after a shutdown is permitted. Enables the service to avoid restarting if
+         new work is discovered. A value of 0 disables the cooldown.
+        :param loop_delay: Time (in seconds) between iterations of the event loop
+        :param health_check_frequency: Time (in seconds) between calls to a
+         health check handler. A value of 0 triggers the health check on every
+         iteration.
         """
         self._as_service = as_service
         """If the service should run until shutdown function returns True"""
@@ -64,8 +66,8 @@ def __init__(
         self._loop_delay = abs(loop_delay)
         """Forced delay between iterations of the event loop"""
         self._health_check_frequency = health_check_frequency
-        """The time (in seconds) between desired health checks. A health check
-        frequency of zero will never trigger the health check."""
+        """The time (in seconds) between desired health checks. Frequency of 0
+        will trigger the health check on every event loop iteration."""
         self._last_health_check = time.time()
         """The timestamp of the latest health check"""
 
@@ -135,7 +137,7 @@ def execute(self) -> None:
                     "Failure in event loop resulted in service termination"
                 )
 
-            if self._health_check_frequency > 0:
+            if self._health_check_frequency >= 0:
                 hc_elapsed = time.time() - self._last_health_check
                 if hc_elapsed >= self._health_check_frequency:
                     self._on_health_check()

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -48,14 +48,14 @@
 import dragon.native.machine as dragon_machine
 
 from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
-from smartsim._core.mli.infrastructure.control.event_listener import (
+from smartsim._core.mli.infrastructure.control.listener import (
     ConsumerRegistrationListener,
 )
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
 )
 from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim.error.errors import SmartSimError

 # pylint: enable=import-error
 # isort: on
@@ -82,8 +82,8 @@


 class DragonStatus(str, Enum):
    ERROR = "Error"
    RUNNING = "Running"

    def __str__(self) -> str:
        return self.value
@@ -157,9 +157,10 @@
    by threads spawned by it.
     """
 
     _DEFAULT_NUM_MGR_PER_NODE = 2
+    """The default number of manager processes for each feature store node"""
     _DEFAULT_MEM_PER_NODE = 256 * 1024**2
     """The default memory capacity to allocate for a feaure store node (in megabytes)"""
 
    def __init__(self, pid: int) -> None:
        self._pid = pid
@@ -194,12 +195,12 @@
        """Whether the server frontend should shut down when the backend does"""
        self._shutdown_initiation_time: t.Optional[float] = None
        """The time at which the server initiated shutdown"""
        self._cooldown_period = self._initialize_cooldown()
        """Time in seconds needed by the server to complete shutdown"""
        self._backbone: t.Optional[BackboneFeatureStore] = None
        """The backbone feature store"""
        self._listener: t.Optional[dragon_process.Process] = None
        """The standalone process executing the event consumer"""

        self._nodes: t.List["dragon_machine.Node"] = []
        """Node capability information for hosts in the allocation"""
@@ -264,8 +265,8 @@

        :returns: a status message
        """
        view = DragonBackendView(self)
        return "Dragon server backend update\n" f"{view.host_table}\n{view.step_table}"

    def _heartbeat(self) -> None:
        """Update the value of the last heartbeat to the current time."""
@@ -548,74 +549,83 @@
                self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED
                self._group_infos[step_id].return_codes = [-9]
 
     def _create_backbone(self) -> BackboneFeatureStore:
         """
-        Create a BackboneFeatureStore if one does not exist.
+        Creates a BackboneFeatureStore if one does not exist. Updates
+        environment variables of this process to include the backbone
+        descriptor.
 
         :returns: The descriptor of the backbone feature store
         """
        if self._backbone is None:
            backbone_storage = create_ddict(
                len(self._hosts),
                self._DEFAULT_NUM_MGR_PER_NODE,
                self._DEFAULT_MEM_PER_NODE,
            )

            self._backbone = BackboneFeatureStore(
                backbone_storage, allow_reserved_writes=True
            )

            # put the backbone descriptor in the env vars
            os.environ.update(self._backbone.get_env())

        return self._backbone

    @staticmethod
    def _initialize_cooldown() -> int:
        """Load environment configuration and determine the correct cooldown
        period to apply to the backend process.

        :returns: The calculated cooldown (in seconds)
        """
        smartsim_config = get_config()
        return (
            smartsim_config.telemetry_frequency * 2 + 5
            if smartsim_config.telemetry_enabled
            else 5
        )

     def start_event_listener(
         self, cpu_affinity: list[int], gpu_affinity: list[int]
     ) -> dragon_process.Process:
+        """Start a standalone event listener.
+
+        :param cpu_affinity: The CPU affinity for the process
+        :param gpu_affinity: The GPU affinity for the process
+        :returns: The dragon Process managing the process
+        :raises SmartSimError: If the backbone feature store is not available
+        """
         if self._backbone is None:
             raise SmartSimError("Backbone feature store is not available")
 
        service = ConsumerRegistrationListener(
            self._backbone, 1.0, 2.0, as_service=True, health_check_frequency=90
        )

        options = dragon_process_desc.ProcessOptions(make_inf_channels=True)
        local_policy = dragon_policy.Policy(
            placement=dragon_policy.Policy.Placement.HOST_NAME,
            host_name=socket.gethostname(),
            cpu_affinity=cpu_affinity,
            gpu_affinity=gpu_affinity,
        )
        process = dragon_process.Process(
            target=service.execute,
            args=[],
             cwd=os.getcwd(),
             env={
                 **os.environ,
-                **(self._backbone.get_env() if self._backbone is not None else {}),
+                **self._backbone.get_env(),
             },
             policy=local_policy,
             options=options,
            stderr=dragon_process.Popen.STDOUT,
            stdout=dragon_process.Popen.STDOUT,
        )
        process.start()
        return process

    @staticmethod
    def create_run_policy(
@@ -657,6 +667,7 @@
         )
 
     def _start_steps(self) -> None:
+        """Start all new steps created since the last update."""
         self._heartbeat()
 
         with self._queue_lock:
@@ -821,6 +832,9 @@
                     group_info.redir_workers = None
 
     def _update_shutdown_status(self) -> None:
+        """Record a heartbeat and update the flag that indicates whether
+        the backend is able to shut down.
+        """
         self._heartbeat()
         with self._queue_lock:
             self._can_shutdown |= (
@@ -834,6 +848,9 @@
             )
 
     def _should_print_status(self) -> bool:
+        """Determine if status messages should be printed based on the time
+        since the last update. Returns `True` to trigger prints, else `False`.
+        """
         if self.current_time - self._last_update_time > 10:
             self._last_update_time = self.current_time
             return True
@@ -841,7 +858,7 @@

    def _update(self) -> None:
        """Trigger all update queries and update local state database"""
        self._create_backbone()

        self._stop_steps()
        self._start_steps()
@@ -850,8 +867,8 @@

    def _kill_all_running_jobs(self) -> None:
        with self._queue_lock:
            if self._listener and self._listener.is_alive:
                self._listener.kill()

            for step_id, group_info in self._group_infos.items():
                if group_info.status not in TERMINAL_STATUSES:
@@ -940,7 +957,7 @@
        self._backend = backend
        """A dragon backend used to produce the view"""

        logger.debug(self.host_desc)

    @property
    def host_desc(self) -> str:

diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py
@@ -245,6 +245,9 @@
 
         with open(config.dragon_dotenv, encoding="utf-8") as dot_env:
             for kvp in dot_env.readlines():
+                if not kvp:
+                    continue
+
                 # skip any commented lines
                 if not kvp.startswith("#"):
                     split = kvp.strip().split("=", maxsplit=1)

diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py
@@ -35,14 +35,6 @@
 
 logger = get_logger(__name__)
 
-DEFAULT_CHANNEL_BUFFER_SIZE = 500
-"""Maximum number of messages that can be buffered. DragonCommChannel will
-raise an exception if no clients consume messages before the buffer is filled."""
-
-LAST_OFFSET = 0
-"""The last offset used to create a local channel. This is used to avoid
-unnecessary retries when creating a local channel."""
-
 
 class DragonCommChannel(cch.CommChannelBase):
     """Passes messages by writing to a Dragon channel."""

diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -51,7 +51,7 @@ def __init__(
     ) -> None:
         """Initialize the DragonFLIChannel instance.
 
-        :param fli_desc: The descriptor of the FLI channel to attach
+        :param fli_: The FLInterface to use as the underlying communications channel
         :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams
         :param buffer_size: Maximum number of sent messages that can be buffered
         """
@@ -79,7 +79,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None:
                 logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
         except Exception as e:
             raise SmartSimError(
-                f"Error sending message: DragonFLIChannel {self.descriptor}"
+                f"Error sending via DragonFLIChannel {self.descriptor}"
             ) from e
 
     def recv(self, timeout: float = 0.001) -> t.List[bytes]:
@@ -99,6 +99,7 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
                     logger.debug(f"DragonFLIChannel {self.descriptor} received message")
                 except fli.FLIEOT:
                     eot = True
+                    logger.debug(f"DragonFLIChannel exhausted: {self.descriptor}")
                 except Exception as e:
                     raise SmartSimError(
                         f"Error receiving messages: DragonFLIChannel {self.descriptor}"
@@ -134,7 +135,8 @@ def from_descriptor(
 
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFLIChannel
-        :raises SmartSimError: If creation of DragonFLIChanenel fails
+        :raises SmartSimError: If creation of DragonFLIChannel fails
+        :raises ValueError: If the descriptor is invalid
         """
         if not descriptor:
             raise ValueError("Invalid descriptor provided")

diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -30,10 +30,7 @@
 
 import dragon.channels as dch
 import dragon.fli as fli
-import dragon.infrastructure.facts as df
-import dragon.infrastructure.parameters as dp
 import dragon.managed_memory as dm
-import dragon.utils as du
 
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
@@ -54,10 +51,10 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str
 
     :param channel: The dragon channel to convert
     :returns: The descriptor string
-    :raises SmartSimError: If a dragon channel is not provided
+    :raises ValueError: If a dragon channel is not provided
     """
     if channel is None:
-        raise SmartSimError("Channel is not available to create a descriptor")
+        raise ValueError("Channel is not available to create a descriptor")
 
     serialized_ch = channel.serialize()
     return base64.b64encode(serialized_ch).decode("utf-8")
@@ -67,9 +64,11 @@ def pool_to_descriptor(pool: dm.MemoryPool) -> str:
     """Convert a dragon memory pool to a descriptor string.
 
     :param pool: The memory pool to convert
-    :returns: The descriptor string"""
+    :returns: The descriptor string
+    :raises ValueError: If a memory pool is not provided
+    """
     if pool is None:
-        raise SmartSimError("Memory pool is not available to create a descriptor")
+        raise ValueError("Memory pool is not available to create a descriptor")
 
     serialized_pool = pool.serialize()
     return base64.b64encode(serialized_pool).decode("utf-8")
@@ -82,6 +81,7 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface":
     :param descriptor: The descriptor of an FLI to attach to
     :returns: The attached dragon FLI
     :raises ValueError: If the descriptor is empty or incorrectly formatted
+    :raises SmartSimError: If attachment using the descriptor fails
     """
     if len(descriptor) < 1:
         raise ValueError("Descriptors may not be empty")
@@ -103,7 +103,8 @@ def descriptor_to_channel(descriptor: str) -> dch.Channel:
     :param descriptor: The descriptor of a channel to attach to
     :returns: The attached dragon Channel
     :raises ValueError: If the descriptor is empty or incorrectly formatted
-    :raises SmartSimError: If the descriptor does not attach to a channel"""
+    :raises SmartSimError: If attachment using the descriptor fails
+    """
     if len(descriptor) < 1:
         raise ValueError("Descriptors may not be empty")
 
@@ -122,43 +123,9 @@ def create_local(_capacity: int = 0) -> dch.Channel:
     direct calls to `dch.Channel.make_process_local()` to enable
     supplying a channel capacity.
 
-    :param capacity: The number of events the channel can buffer; uses the default
+    :param _capacity: The number of events the channel can buffer; uses the default
     buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied
     :returns: The instantiated channel
-    :raises SmartSimError: If unable to attach local channel
     """
-    # current implementation has a bug wrt MPI that must be fixed.
-    # falling back to `make_process_local` and disabling buffer size tests
-
-    # pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd))
-    # pool_descriptor = pool_to_descriptor(pool)
-    # channel: t.Optional[dch.Channel] = None
-    # offset = 0
-
-    # global LAST_OFFSET
-    # if LAST_OFFSET:
-    #     offset = LAST_OFFSET
-
-    # capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE
-
-    # while not channel:
-    #     # search for an open channel ID
-    #     offset += 1
-    #     channel_id = df.BASE_USER_MANAGED_CUID + offset
-    #     try:
-    #         channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity)
-    #         LAST_OFFSET = offset
-    #         descriptor = channel_to_descriptor(channel)
-    #         logger.debug(
-    #             "Local channel created: "
-    #             f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}"
-    #         )
-    #     except dch.ChannelError as e:
-    #         if offset < 100:
-    #             logger.warning(f"Channnel id `{channel_id}` is not open. Retrying...")
-    #         else:
-    #             LAST_OFFSET = 0
-    #             logger.error(f"All attempts to attach local channel have failed")
-    #             raise SmartSimError("Failed to attach local channel") from e
     channel = dch.Channel.make_process_local()
     return channel
diff --git a/smartsim/_core/mli/infrastructure/comm/__init__.py b/smartsim/_core/mli/infrastructure/comm/__init__.py