From c6a145acad6da4aefe11afba194a6aa4df0534d5 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 17 Sep 2024 19:41:19 -0500 Subject: [PATCH 01/40] Squash event integration --- doc/changelog.md | 2 +- ex/high_throughput_inference/mock_app.py | 109 +-- .../standalone_worker_manager.py | 31 +- .../_core/launcher/dragon/dragonBackend.py | 151 +++- smartsim/_core/mli/comm/channel/channel.py | 24 +- .../_core/mli/comm/channel/dragon_channel.py | 23 +- smartsim/_core/mli/comm/channel/dragon_fli.py | 44 +- .../infrastructure/control/error_handling.py | 2 +- .../mli/infrastructure/environment_loader.py | 2 +- .../storage/backbone_feature_store.py | 251 ++++++- .../storage/dragon_feature_store.py | 8 +- .../infrastructure/storage/feature_store.py | 30 +- .../_core/mli/infrastructure/worker/worker.py | 41 +- smartsim/_core/mli/message_handler.py | 24 +- .../mli_schemas/data/data_references.capnp | 4 +- .../data/data_references_capnp.pyi | 4 +- .../mli/mli_schemas/request/request.capnp | 2 +- .../mli/mli_schemas/request/request_capnp.pyi | 2 +- smartsim/log.py | 7 +- smartsim/protoclient.py | 285 +++++++ tests/dragon/test_dragon_backend.py | 174 +++++ tests/dragon/test_environment_loader.py | 2 +- tests/dragon/test_error_handling.py | 80 +- tests/dragon/test_featurestore.py | 338 +++++++++ tests/dragon/test_featurestore_base.py | 96 ++- tests/dragon/test_featurestore_integration.py | 3 +- tests/dragon/test_protoclient.py | 231 ++++++ tests/dragon/test_request_dispatcher.py | 81 +- tests/dragon/test_worker_manager.py | 557 ++++++++------ tests/dragon/utils/channel.py | 18 +- tests/mli/channel.py | 18 +- tests/mli/test_integrated_torch_worker.py | 24 +- tests/test_featurestore.py | 711 ++++++++++++++++++ .../test_build_model_key.py | 10 +- .../test_output_descriptor.py | 2 +- tests/test_message_handler/test_request.py | 38 +- tests/test_message_handler/test_response.py | 4 +- 37 files changed, 2874 insertions(+), 559 deletions(-) create mode 100644 smartsim/protoclient.py create mode 100644 tests/dragon/test_dragon_backend.py create mode 100644 tests/dragon/test_featurestore.py create mode 100644 tests/dragon/test_protoclient.py create mode 100644 tests/test_featurestore.py diff --git a/doc/changelog.md b/doc/changelog.md index 7d08c9376..b0e326d1f 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,12 +13,12 @@ Jump to: Description +- Implement asynchronous notifications for shared data - Quick bug fix in _validate - Add helper methods to MLI classes - Update error handling for consistency - Parameterize installation of dragon package with `smart build` - Update docstrings -- Implement asynchronous notifications for shared data - Filenames conform to snake case - Update SmartSim environment variables using new naming convention - Refactor `exception_handler` diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index dcc52296e..31195c7e6 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -37,18 +37,10 @@ import argparse import io -import numpy -import os -import time + import torch -from mpi4py import MPI -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from smartsim._core.utils.timings import PerfTimer torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -56,79 +48,15 @@ logger = get_logger("App") logger.info("Started app") 
-CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False +from collections import OrderedDict -class ProtoClient: - def __init__(self, timing_on: bool): - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - self._ddict = DDict.attach(ddict_str) - self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - to_worker_fli_str = None - while to_worker_fli_str is None: - try: - to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - except KeyError: - time.sleep(1) - self._from_worker_ch = Channel.make_process_local() - self._from_worker_ch_serialized = self._from_worker_ch.serialize() - self._to_worker_ch = Channel.make_process_local() - - self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") - - def run_model(self, model: bytes | str, batch: torch.Tensor): - tensors = [batch.numpy()] - self.perf_timer.start_timings("batch_size", batch.shape[0]) - built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape) - ) - self.perf_timer.measure_time("build_tensor_descriptor") - if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) - else: - model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") - request = MessageHandler.build_request( - reply_channel=self._from_worker_ch_serialized, - model=model_arg, - inputs=[built_tensor_desc], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - self.perf_timer.measure_time("build_request") - request_bytes = MessageHandler.serialize_request(request) - self.perf_timer.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: - to_sendh.send_bytes(request_bytes) - self.perf_timer.measure_time("send_request") - for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! - self.perf_timer.measure_time("send_tensors") - with self._from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_response") - response = MessageHandler.deserialize_response(resp) - self.perf_timer.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(response.result.descriptors)? 
- data_blob: bytes = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_tensor") - result = torch.from_numpy( - numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) - ) - self.perf_timer.measure_time("deserialize_tensor") +from smartsim.log import get_logger, log_to_file +from smartsim.protoclient import ProtoClient - self.perf_timer.end_timings() - return result +logger = get_logger("App", "DEBUG") - def set_model(self, key: str, model: bytes): - self._ddict[key] = model +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False class ResNetWrapper: @@ -151,6 +79,7 @@ def model(self): def name(self): return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -160,30 +89,38 @@ def name(self): resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") - client = ProtoClient(timing_on=True) - client.set_model(resnet.name, resnet.model) + client = ProtoClient(timing_on=True, wait_timeout=0) + # client.set_model(resnet.name, resnet.model) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: # TODO: adapt to non-Nvidia devices torch_device = args.device.replace("gpu", "cuda") - pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to( + torch_device + ) TOTAL_ITERATIONS = 100 - for log2_bsize in range(args.log_max_batchsize+1): + for log2_bsize in range(args.log_max_batchsize + 1): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") - for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): + for iteration_number in range(TOTAL_ITERATIONS + int(b_size == 1)): logger.info(f"Iteration: {iteration_number}") sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) logger.info(client.perf_timer.get_last("total_time")) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: local_res = pt_model(sample_batch.to(torch_device)) - err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() + err_norm = torch.linalg.vector_norm( + torch.flatten(remote_result).to(torch_device) + - torch.flatten(local_res), + ord=1, + ).cpu() res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() - logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + logger.info( + f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}" + ) torch.cuda.synchronize() - client.perf_timer.print_timings(to_file=True) \ No newline at end of file + client.perf_timer.print_timings(to_file=True) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index feb1af1ae..e34df0ccd 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -37,6 +37,7 @@ from dragon.globalservices.api_setup import connect_to_infrastructure from dragon.managed_memory import MemoryPool from dragon.utils import b64decode, b64encode + # pylint enable=import-error # isort: off @@ -45,6 +46,7 @@ import argparse import base64 import multiprocessing as mp +import optparse import os import pickle import socket @@ -53,26 +55,24 @@ import typing as t import cloudpickle -import optparse -import os from smartsim._core.entrypoints.service 
import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) from smartsim._core.mli.infrastructure.control.worker_manager import WorkerManager from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase - from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -85,7 +85,6 @@ logger.info(f"CPUS: {os.cpu_count()}") - def service_as_dragon_proc( service: Service, cpu_affinity: list[int], gpu_affinity: list[int] ) -> dragon_process.Process: @@ -108,8 +107,6 @@ def service_as_dragon_proc( ) - - if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") parser.add_argument( @@ -144,26 +141,24 @@ def service_as_dragon_proc( connect_to_infrastructure() ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - ddict = DDict.attach(ddict_str) + + backbone = BackboneFeatureStore.from_descriptor(ddict_str) to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - ddict["to_worker_fli"] = to_worker_fli_serialized + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, True) + + backbone.worker_queue = to_worker_fli_comm_channel.descriptor arg_worker_type = cloudpickle.loads( base64.b64decode(args.worker_class.encode("ascii")) ) - dfs = DragonFeatureStore(ddict) - comm_channel = DragonFLIChannel(to_worker_fli_serialized) - - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 7526af14a..0f8121ab5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools import itertools +import multiprocessing as mp import time import typing as t from dataclasses import dataclass, field @@ -34,18 +35,27 @@ from tabulate import tabulate -# pylint: disable=import-error +# pylint: disable=import-error,C0302,R0915,R6301 # isort: off import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.group_state as dragon_group_state + +# import dragon.native.group_state as dragon_group_state import 
dragon.native.process as dragon_process
import dragon.native.process_group as dragon_process_group
import dragon.native.machine as dragon_machine

from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBase,
+    # EventBroadcaster,
+    EventCategory,
+    EventConsumer,
+)

# pylint: enable=import-error
# isort: on
@@ -72,8 +82,8 @@

class DragonStatus(str, Enum):
-    ERROR = str(dragon_group_state.Error())
-    RUNNING = str(dragon_group_state.Running())
+    ERROR = "Error"  # str(dragon_group_state.Error())
+    RUNNING = "Running"  # str(dragon_group_state.Running())

    def __str__(self) -> str:
        return self.value
@@ -187,8 +197,15 @@ def __init__(self, pid: int) -> None:
            else 5
        )
        """Time in seconds needed to server to complete shutdown"""
-        self._infra_ddict: t.Optional[dragon_ddict.DDict] = None
-
+        self._backbone: t.Optional[BackboneFeatureStore] = None
+        """The backbone feature store"""
+        self._event_consumer: t.Optional[EventConsumer] = None
+        """A listener registered to listen for new consumers and update the shared
+        consumer registrations list"""
+        self._event_consumer_process: t.Optional[mp.Process] = None
+        """The process executing the event consumer's `listen` method"""
+
        self._nodes: t.List["dragon_machine.Node"] = []
        """Node capability information for hosts in the allocation"""
        self._hosts: t.List[str] = []
@@ -539,21 +556,113 @@ def _stop_steps(self) -> None:
                self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED
                self._group_infos[step_id].return_codes = [-9]

-    @property
-    def infra_ddict(self) -> str:
-        """Create a Dragon distributed dictionary and return its
-        serialized descriptor
+    def _create_backbone(self) -> BackboneFeatureStore:
+        """
+        Create a BackboneFeatureStore if one does not exist.
+
+        :returns: The backbone feature store instance
        """
-        if self._infra_ddict is None:
-            logger.info("Creating DDict")
-            self._infra_ddict = dragon_ddict.DDict(
+        if self._backbone is None:
+            logger.info("Creating backbone storage DDict")
+            backbone_storage = dragon_ddict.DDict(
                n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3
            )  # todo: parametrize
-            logger.info("Created DDict")
-            self._infra_ddict["creation"] = str(time.time())
-            logger.info(self._infra_ddict["creation"])
+            logger.info("Created backbone storage DDict")
+            self._backbone = BackboneFeatureStore(
+                backbone_storage, allow_reserved_writes=True
+            )
+            logger.info(self._backbone.creation_date)
+
+        return self._backbone
+
+    def _on_consumer_created(self, event: EventBase) -> None:
+        """Event handler invoked when a consumer-created event is received.
+
+        :param event: The received event"""
+        logger.warning(f"Unhandled event received: {event}")
+
+    def _bootstrap_event_listeners(
+        self, backbone: BackboneFeatureStore, consumer: EventConsumer
+    ) -> None:
+        """Update the list of notification channels registered in the backbone.
+
+        :param backbone: The backbone feature store to update
+        :param consumer: The consumer whose descriptor is added to the
+        notification list"""
+        # Copy the consumer list so a backend restart doesn't clear registrations
+        notify_descriptors = list(backbone.notification_channels)

+        # Update directly to avoid SEND/ACK pattern
+        notify_descriptors.append(consumer.descriptor)
+        # consumer.register()  # this will loop infinitely waiting for itself

+        backbone.notification_channels = notify_descriptors

+    def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer:
+        """
+        Create an event publisher and event consumer for communicating with
+        other MLI resources.

+        :param backbone: The backbone feature store used by the MLI backend. NOTE:
+        passing backbone as a parameter to ensure the backbone is initialized before
+        attempting to connect any eventing clients.
+        :returns: The newly created EventConsumer instance
+        """
+        # todo: create the backend EventBroadcaster here once a publisher is
+        # needed; ensure it uses DragonCommChannel.from_descriptor, not from_local

+        if self._event_consumer is None:
+            logger.info("Creating event consumer")
+            event_channel = DragonCommChannel.from_local()
+            consumer = EventConsumer(
+                event_channel,
+                backbone,
+                [EventCategory.CONSUMER_CREATED],
+                name="BackendConsumerRegistrar",
+                event_handler=self._on_consumer_created,
+            )

+            # todo: decide whether to publish consumer.descriptor via
+            # backbone.backend_channel or drop that extra channel entirely
+            # self._bootstrap_event_listeners(backbone, consumer)
+            self._event_consumer = consumer

+            # todo: launch the consumer via a dragon ProcessGroup/ProcessTemplate
+            # (passing **self._backbone.get_env()) instead of multiprocessing;
+            # see `_start_eventing_listeners`
+ # grp_consumer = dragon_process_group.ProcessGroup( + # restart=False, pmi_enabled=False + # ) + # self._event_consumer_process = dragon_process.ProcessTemplate( + # target=self._event_consumer.listen, + # # args=request.exe_args, + # # cwd=request.path, + # env={ + # # **request.current_env, + # # **request.env, + # **self._backbone.get_env(), + # }, + # stdout=dragon_process.Popen.PIPE, + # stderr=dragon_process.Popen.PIPE, + # # policy=local_policy, + # options=options, + # ) + # grp_consumer.add(self._event_consumer_process) + # # self._event_consumer_process = + # mp.Process(target=self._event_consumer.listen) + # # self._event_consumer_process.start() + # grp_consumer.init() + # grp_consumer.start() + + logger.info("Created event consumer") + + return self._event_consumer + + def _start_eventing_listeners(self) -> None: + if self._event_consumer: + self._event_consumer_process = mp.Process( + target=self._event_consumer.listen + ) + self._event_consumer_process.start() @staticmethod def create_run_policy( @@ -596,6 +705,9 @@ def create_run_policy( def _start_steps(self) -> None: self._heartbeat() + backbone = self._create_backbone() + self._create_eventing(backbone) + with self._queue_lock: started = [] for step_id, request in self._queued_steps.items(): @@ -622,7 +734,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "_SMARTSIM_INFRA_BACKBONE": self.infra_ddict, + **backbone.get_env(), }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, @@ -778,6 +890,9 @@ def _should_print_status(self) -> bool: def _update(self) -> None: """Trigger all update queries and update local state database""" + backbone = self._create_backbone() + self._create_eventing(backbone) + self._stop_steps() self._start_steps() self._refresh_statuses() diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 9a12e4c8d..90d81cb9b 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -26,6 +26,7 @@ import base64 import typing as t +import uuid from abc import ABC, abstractmethod from smartsim.log import get_logger @@ -36,12 +37,19 @@ class CommChannelBase(ABC): """Base class for abstracting a message passing mechanism""" - def __init__(self, descriptor: t.Union[str, bytes]) -> None: + def __init__( + self, + descriptor: str, + name: t.Optional[str] = None, + ) -> None: """Initialize the CommChannel instance. :param descriptor: Channel descriptor """ self._descriptor = descriptor + """An opaque identifier used to connect to an underlying communication channel""" + self._name = name or str(uuid.uuid4()) + """A user-friendly identifier for channel-related logging""" @abstractmethod def send(self, value: bytes, timeout: float = 0) -> None: @@ -61,11 +69,19 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: """ @property - def descriptor(self) -> bytes: + def descriptor(self) -> str: """Return the channel descriptor for the underlying dragon channel. 
diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
index 9a12e4c8d..90d81cb9b 100644
--- a/smartsim/_core/mli/comm/channel/channel.py
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -26,6 +26,7 @@

import base64
import typing as t
+import uuid
from abc import ABC, abstractmethod

from smartsim.log import get_logger
@@ -36,12 +37,19 @@
class CommChannelBase(ABC):
    """Base class for abstracting a message passing mechanism"""

-    def __init__(self, descriptor: t.Union[str, bytes]) -> None:
+    def __init__(
+        self,
+        descriptor: str,
+        name: t.Optional[str] = None,
+    ) -> None:
        """Initialize the CommChannel instance.

        :param descriptor: Channel descriptor
+        :param name: A user-friendly name used in channel-related logging
        """
        self._descriptor = descriptor
+        """An opaque identifier used to connect to an underlying communication channel"""
+        self._name = name or str(uuid.uuid4())
+        """A user-friendly identifier for channel-related logging"""

    @abstractmethod
    def send(self, value: bytes, timeout: float = 0) -> None:
@@ -61,11 +69,19 @@ def recv(self, timeout: float = 0) -> t.List[bytes]:
        """

    @property
-    def descriptor(self) -> bytes:
+    def descriptor(self) -> str:
        """Return the channel descriptor for the underlying dragon channel.

-        :returns: Byte encoded channel descriptor
+        :returns: String-encoded channel descriptor
        """
-        if isinstance(self._descriptor, str):
-            return base64.b64decode(self._descriptor.encode("utf-8"))
        return self._descriptor
+
+    @property
+    def decoded_descriptor(self) -> bytes:
+        """Return the descriptor decoded from a string into bytes"""
+        return base64.b64decode(self._descriptor.encode("utf-8"))
+
+    def __str__(self) -> str:
+        """Build a string representation of the channel useful for printing"""
+        classname = type(self).__name__
+        return f"{classname}('{self._name}', '{self._descriptor}')"
diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py
index 1363c0d67..a22ebe952 100644
--- a/smartsim/_core/mli/comm/channel/dragon_channel.py
+++ b/smartsim/_core/mli/comm/channel/dragon_channel.py
@@ -130,15 +130,17 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
        with self._channel.recvh(timeout=timeout) as recvh:
            messages: t.List[bytes] = []

+            # todo: consider that this could (under load) never exit. do we need
+            # to configure a maximum number to pull at once?
            try:
                message_bytes = recvh.recv_bytes(timeout=timeout)
                messages.append(message_bytes)
-                logger.debug(f"DragonCommChannel {self.descriptor!r} received message")
+                logger.debug(f"DragonCommChannel {self.descriptor} received message")
            except dch.ChannelEmpty:
                # emptied the queue, ok to swallow this ex
-                logger.debug(f"DragonCommChannel exhausted: {self.descriptor!r}")
+                logger.debug(f"DragonCommChannel exhausted: {self.descriptor}")
            except dch.ChannelRecvTimeout as ex:
-                logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor!r}")
+                logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor}")

            return messages

@@ -169,8 +171,7 @@ def from_descriptor(
        :param descriptor: The descriptor that uniquely identifies the resource.
        Output from `descriptor_string` is correctly encoded.
        :returns: An attached DragonCommChannel
-        :raises SmartSimError: If creation of comm channel fails
-        """
+        :raises SmartSimError: If creation of comm channel fails"""
        try:
            utf8_descriptor: t.Union[str, bytes] = descriptor
            if isinstance(descriptor, str):
@@ -186,3 +187,15 @@
            raise SmartSimError(
                f"Failed to create dragon comm channel: {descriptor!r}"
            ) from ex
+
+    @classmethod
+    def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel":
+        """A factory method that creates a local channel instance
+
+        :returns: An attached DragonCommChannel"""
+        try:
+            channel = dch.Channel.make_process_local()
+            return DragonCommChannel(channel)
+        except Exception:
+            logger.error("Failed to create local dragon comm channel", exc_info=True)
+            raise
diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
index 84d809c8a..325f6b779 100644
--- a/smartsim/_core/mli/comm/channel/dragon_fli.py
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -50,7 +50,7 @@ class DragonFLIChannel(cch.CommChannelBase):

    def __init__(
        self,
-        fli_desc: bytes,
+        fli_: fli.FLInterface,
        sender_supplied: bool = True,
        buffer_size: int = 0,
    ) -> None:
@@ -60,9 +60,11 @@
        :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams
        :param buffer_size: Maximum number of sent messages that can be buffered
        """
-        super().__init__(fli_desc)
-        self._fli: "fli" = fli.FLInterface.attach(fli_desc)
-        self._channel: t.Optional["dch"] = (
+        descriptor = base64.b64encode(fli_.serialize()).decode("utf-8")
+        super().__init__(descriptor)
+
+        self._fli = fli_
+        self._channel: t.Optional["dch.Channel"] = (
            create_local(buffer_size) if sender_supplied else None
        )

@@ -107,6 +109,33 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
                ) from e
        return messages

+    @classmethod
+    def _string_descriptor_to_fli(cls, descriptor: str) -> "fli.FLInterface":
+        """Helper method to convert a string-safe, encoded descriptor back
+        into its original byte format and attach to the underlying FLI"""
+        descriptor_ = base64.b64decode(descriptor.encode("utf-8"))
+        return fli.FLInterface.attach(descriptor_)
+
+    @classmethod
+    def from_sender_supplied_descriptor(
+        cls,
+        descriptor: str,
+    ) -> "DragonFLIChannel":
+        """A factory method that creates an instance from a descriptor string
+
+        :param descriptor: the descriptor of the main FLI channel to attach
+        :returns: An attached DragonFLIChannel"""
+        try:
+            return DragonFLIChannel(
+                fli_=cls._string_descriptor_to_fli(descriptor),
+                sender_supplied=True,
+            )
+        except Exception:
+            logger.error(
+                f"Error while creating sender supplied DragonFLIChannel: {descriptor}"
+            )
+            raise
+
    @classmethod
    def from_descriptor(
        cls,
@@ -118,10 +147,13 @@
        :returns: An attached DragonFLIChannel
        :raises SmartSimError: If creation of DragonFLIChannel fails
        """
+        if not descriptor:
+            raise ValueError("Invalid descriptor provided")
+
        try:
            return DragonFLIChannel(
-                fli_desc=base64.b64decode(descriptor),
-                sender_supplied=True,
+                fli_=cls._string_descriptor_to_fli(descriptor),
+                sender_supplied=False,
            )
        except Exception as e:
            raise SmartSimError(
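# A minimal round-trip sketch for the descriptor handling above (illustrative
# only; assumes a running dragon runtime and an existing `fli_` FLInterface):
#
#     channel = DragonFLIChannel(fli_, sender_supplied=True)
#     descriptor = channel.descriptor  # base64 string, safe to store in the backbone
#     attached = DragonFLIChannel.from_sender_supplied_descriptor(descriptor)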
diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py
index 8961cac54..a75f533a3 100644
--- a/smartsim/_core/mli/infrastructure/control/error_handling.py
+++ b/smartsim/_core/mli/infrastructure/control/error_handling.py
@@ -48,7 +48,7 @@ def build_failure_reply(status: "Status", message: str) -> ResponseBuilder:
    return MessageHandler.build_response(
        status=status,
        message=message,
-        result=[],
+        result=None,
        custom_attributes=None,
    )
diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py
index 02043fbd8..e67cc469a 100644
--- a/smartsim/_core/mli/infrastructure/environment_loader.py
+++ b/smartsim/_core/mli/infrastructure/environment_loader.py
@@ -42,7 +42,7 @@ class EnvironmentConfigLoader:
    def __init__(
        self,
        featurestore_factory: t.Callable[[str], FeatureStore],
-        callback_factory: t.Callable[[bytes], CommChannelBase],
+        callback_factory: t.Callable[[str], CommChannelBase],
        queue_factory: t.Callable[[str], CommChannelBase],
    ) -> None:
        """Initialize the config loader instance with the factories necessary for
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index b6655bded..0db41f77a 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -24,7 +24,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import base64
import enum
+import itertools
import pickle
import time
import typing as t
@@ -39,6 +41,7 @@
# isort: on

from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
    DragonFeatureStore,
)
@@ -48,6 +51,14 @@
logger = get_logger(__name__)

+def byte_descriptor_to_string(descriptor: bytes) -> str:
+    """Encode a byte descriptor into a string-safe base64 form"""
+    return base64.b64encode(descriptor).decode("utf-8")
+
+
+def string_descriptor_to_byte(descriptor: str) -> bytes:
+    """Decode a base64 string descriptor back into its byte form"""
+    return base64.b64decode(descriptor.encode("utf-8"))
+
+
# todo: did i create an arms race where a developer just grabs the backbone
# and passes it wherever they need a FeatureStore?
class BackboneFeatureStore(DragonFeatureStore):
    """A DragonFeatureStore wrapper with utility methods for accessing the MLI
    information stored in the MLI backbone feature store."""

    MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
+    MLI_BACKEND_CONSUMER = "_SMARTSIM_MLI_BACKEND_CONSUMER"
+    MLI_WORKER_QUEUE = "to_worker_fli"
+    MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
+    _CREATED_ON = "creation"

    def __init__(
-        self, storage: "dragon_ddict.DDict", allow_reserved_writes: bool = False
+        self,
+        storage: "dragon_ddict.DDict",
+        allow_reserved_writes: bool = False,
    ) -> None:
-        """Initialize the DragonFeatureStore instance.
+        """Initialize the BackboneFeatureStore instance.
@@ -68,6 +85,17 @@
        super().__init__(storage)
        self._enable_reserved_writes = allow_reserved_writes
+        self._wait_timeout = 0.0
+        """Maximum time (in seconds) to wait when retrieving multiple keys"""

+        if self._CREATED_ON not in self:
+            self._record_creation_date()
+
+    @property
+    def wait_timeout(self) -> float:
+        """The timeout (in seconds) applied when waiting for multiple keys
+
+        :returns: The current timeout value"""
+        return self._wait_timeout
+
+    @wait_timeout.setter
+    def wait_timeout(self, value: float) -> None:
+        """Set the timeout applied when waiting for multiple keys
+
+        :param value: The new timeout value, in seconds"""
+        self._wait_timeout = value
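    # Typical round trip (sketch): the backend exports the backbone descriptor
    # through `get_env()` and a launched process re-attaches with write access:
    #
    #     env = backbone.get_env()  # {"_SMARTSIM_INFRA_BACKBONE": <descriptor>}
    #     attached = BackboneFeatureStore.from_writable_descriptor(
    #         env[BackboneFeatureStore.MLI_BACKBONE]
    #     )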
    @property
    def notification_channels(self) -> t.Sequence[str]:
        """Retrieve descriptors for all registered MLI notification channels.
@@ -87,6 +115,135 @@ def notification_channels(self, values: t.Sequence[str]) -> None:
        """
        self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values])

+    @property
+    def backend_channel(self) -> t.Optional[str]:
+        """Retrieve the channel descriptor exposed by the MLI backend for events
+
+        :returns: a stringified channel descriptor"""
+        if self.MLI_BACKEND_CONSUMER in self:
+            return str(self[self.MLI_BACKEND_CONSUMER])
+        return None
+
+    @backend_channel.setter
+    def backend_channel(self, value: str) -> None:
+        """Set the channel exposed by the MLI backend for events
+
+        :param value: a stringified channel descriptor"""
+        self[self.MLI_BACKEND_CONSUMER] = value
+
+    @property
+    def worker_queue(self) -> t.Optional[str]:
+        """Retrieve the channel descriptor exposed by the MLI
+        backend to send work to an MLI worker manager instance
+
+        :returns: a stringified channel descriptor"""
+        if self.MLI_WORKER_QUEUE in self:
+            return str(self[self.MLI_WORKER_QUEUE])
+        return None
+
+    @worker_queue.setter
+    def worker_queue(self, value: str) -> None:
+        """Set the channel descriptor exposed by the MLI
+        backend to send work to an MLI worker manager instance
+
+        :param value: a stringified channel descriptor"""
+        self[self.MLI_WORKER_QUEUE] = value
+
+    @property
+    def creation_date(self) -> str:
+        """Return the creation date for the backbone feature store"""
+        return str(self[self._CREATED_ON])
+
+    def _record_creation_date(self) -> None:
+        """Write the creation timestamp to the feature store"""
+        if self._CREATED_ON not in self:
+            if not self._allow_reserved_writes:
+                logger.warning(
+                    "Recorded creation from a write-protected backbone instance"
+                )
+            self[self._CREATED_ON] = str(time.time())
+
+    @classmethod
+    def from_writable_descriptor(
+        cls,
+        descriptor: str,
+    ) -> "BackboneFeatureStore":
+        """A factory method that creates an instance from a descriptor string
+
+        :param descriptor: The descriptor that uniquely identifies the resource
+        :returns: An attached BackboneFeatureStore
+        :raises SmartSimError: if attachment to the backbone feature store fails"""
+        try:
+            return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True)
+        except Exception as ex:
+            logger.error(f"Error creating dragon feature store: {descriptor}")
+            raise SmartSimError(
+                f"Error creating dragon feature store: {descriptor}"
+            ) from ex

    @staticmethod
    def _check_wait_timeout(
        start_time: float, timeout: float, indicators: t.Dict[str, bool]
    ) -> None:
        """Perform timeout verification

        :param start_time: the start time to use for elapsed calculation
        :param timeout: the timeout (in seconds)
        :param indicators: latest retrieval status for requested keys
        :raises SmartSimError: if the timeout has elapsed"""
        elapsed = time.time() - start_time
        if timeout and elapsed > timeout:
            raise SmartSimError(
                f"Timeout retrieving all keys from backbone: {indicators}"
            )

    def wait_for(
        self, keys: t.List[str], timeout: float = 0
    ) -> t.Dict[str, t.Union[str, bytes, None]]:
        """Perform a blocking wait until all specified keys have been found
        in the backbone

        :param keys: The required collection of keys to retrieve
        :param timeout: The maximum wait time in seconds. Overrides class level setting
        :returns: A mapping of each requested key to its retrieved value
        :raises SmartSimError: if the timeout elapses before all keys are found
        """

        to_check = list(keys)
        was_found = [False for _ in to_check]  # todo: add a test ensuring duplicate keys are handled
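        # Poll the backbone with a cycling backoff (0.1s up to 8s) rather than a
        # fixed interval: early retries stay responsive while later retries avoid
        # hammering the dictionary when keys are slow to appear.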
        values: t.List[t.Union[str, bytes, None]] = [None for _ in to_check]

        backoff: t.List[float] = [0.1, 0.5, 1, 2, 4, 8]
        backoff_iter = itertools.cycle(backoff)
        start_time = time.time()

        while not all(was_found):
            delay = next(backoff_iter)

            for index, key in enumerate(to_check):
                if was_found[index]:
                    continue

                try:
                    values[index] = self[key]
                    was_found[index] = True
                except KeyError:
                    if delay == backoff[-1]:
                        logger.debug(f"Re-attempting `{key}` retrieval in {delay}s")

            if all(was_found):
                continue

            self._check_wait_timeout(
                start_time, timeout, dict(zip(to_check, was_found))
            )

            time.sleep(delay)

        return dict(zip(keys, values))

    def get_env(self) -> t.Dict[str, str]:
        """Returns a dictionary populated with environment variables necessary to
        connect a process to the existing backbone instance."""
        return {self.MLI_BACKBONE: self.descriptor}


class EventCategory(str, enum.Enum):
    """Predefined event types raised by SmartSim backend."""
@@ -126,21 +283,26 @@
class OnCreateConsumer(EventBase):

    descriptor: str
    """Descriptor of the comm channel exposed by the consumer"""
+    filters: t.List[EventCategory]
+    """The collection of filters indicating messages of interest to this consumer"""

-    def __init__(self, descriptor: str) -> None:
+    def __init__(self, descriptor: str, filters: t.Sequence[EventCategory]) -> None:
        """Initialize the OnCreateConsumer event.

        :param descriptor: Descriptor of the comm channel exposed by the consumer
+        :param filters: Collection of filters indicating messages of interest
        """
        super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4()))
        self.descriptor = descriptor
+        self.filters = list(filters)

    def __str__(self) -> str:
        """Convert the event to a string.

        :returns: A string representation of this instance
        """
-        return f"{str(super())}|{self.descriptor}"
+        _filters = ",".join(self.filters)
+        return f"{str(super())}|{self.descriptor}|{_filters}"


class OnWriteFeatureStore(EventBase):
@@ -181,6 +343,36 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int:
        """


+class EventSender:
+    """An event publisher that performs publishing of system events to a
+    single endpoint"""
+
+    def __init__(
+        self,
+        backbone: BackboneFeatureStore,
+        channel: t.Optional[CommChannelBase],
+    ) -> None:
+        """Initialize the EventSender instance.
+
+        :param backbone: The backbone feature store used for configuration
+        :param channel: The comm channel the sender publishes to"""
+        self._backbone = backbone
+        self._channel: t.Optional[CommChannelBase] = channel
+
+    def send(self, event: EventBase) -> int:
+        """Serialize and send an event to the configured endpoint.
+
+        :param event: The event to send
+        :returns: The number of events sent
+        :raises SmartSimError: if no channel is configured or the send fails"""
+        if self._channel is None:
+            raise SmartSimError("No channel to send on")
+        num_sent = 0
+
+        try:
+            event_bytes = bytes(event)
+            self._channel.send(event_bytes)
+            num_sent += 1
+        except Exception as ex:
+            raise SmartSimError(f"Failed broadcast to channel: {self._channel}") from ex
+
+        return num_sent
+
+
class EventBroadcaster:
    """Performs fan-out publishing of system events."""
@@ -353,6 +545,8 @@ def __init__(
        backbone: BackboneFeatureStore,
        filters: t.Optional[t.List[EventCategory]] = None,
        batch_timeout: t.Optional[float] = None,
+        name: t.Optional[str] = None,
+        event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
    ) -> None:
        """Initialize the EventConsumer instance.
@@ -371,6 +565,15 @@ def __init__(
        self._backbone = backbone
        self._global_filters = filters or []
        self._global_timeout = batch_timeout or 1.0
+        self._name = name
+        """A user-friendly name for logging and registration"""
+        self._event_handler = event_handler
+        """An optional callback invoked for each received event"""
+
+    @property
+    def descriptor(self) -> str:
+        """The descriptor of the underlying comm channel where events are received
+
+        :returns: The comm channel descriptor"""
+        return self._comm_channel.descriptor

    def receive(
        self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0
@@ -417,3 +620,45 @@ def receive(
                break

        return messages
+
+    def register(self) -> t.Generator[bool, None, None]:
+        """Send an event to register this consumer as a listener.
+
+        Yields `False` while registration is pending and `True` once the
+        consumer's descriptor appears in the backbone registration list."""
+        awaiting_confirmation = True
+        descriptor = self._comm_channel.descriptor
+        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0, 8.0))
+        event = OnCreateConsumer(descriptor, self._global_filters)
+
+        # we're going to sit in this loop to wait for the backbone to get
+        # updated with the registration (to avoid SEND/ACK)
+        while awaiting_confirmation:
+            registered_channels = self._backbone.notification_channels
+            # todo: standardize on string descriptors across the channel ABC
+            if descriptor in registered_channels:
+                awaiting_confirmation = False
+
+            yield not awaiting_confirmation
+            time.sleep(next(backoffs))
+
+        # todo: optionally notify the backend directly by sending this event
+        # through an EventSender attached to `self._backbone.backend_channel`
+
+        # broadcast that this consumer is now ready to mingle
+        publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local)
+        publisher.send(event, timeout=0.1)
+
+    def listen(self) -> None:
+        """Receive events continuously, dispatching each one to the
+        registered event handler."""
+        logger.debug("Starting event listener")
+
+        while True:
+            logger.debug("Awaiting new message")
+            incoming_messages = self.receive()
+            for message in incoming_messages:
+                if self._event_handler:
+                    self._event_handler(message)
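# A minimal consumer wiring sketch (illustrative only; mirrors the backend usage
# in `DragonBackend._create_eventing` and assumes an attached backbone):
#
#     channel = DragonCommChannel.from_local()
#     consumer = EventConsumer(
#         channel,
#         backbone,
#         [EventCategory.CONSUMER_CREATED],
#         name="ExampleConsumer",
#         event_handler=lambda event: logger.info(f"received: {event}"),
#     )
#     for confirmed in consumer.register():  # yields until registration is visible
#         if confirmed:
#             break
#     consumer.listen()  # blocks, dispatching events to the handler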
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index d7b37ffe6..0256b1a51 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -46,13 +46,14 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None:
        """Initialize the DragonFeatureStore instance.

        :param storage: A distributed dictionary to be used as the underlying
-        storage mechanism of the feature store
-        """
+        storage mechanism of the feature store"""
        if isinstance(storage, dragon_ddict.DDict):
            descriptor = str(storage.serialize())
        else:
            descriptor = "not-set"

+        # todo: follow up and ensure this descriptor is also encoded/decoded
+        # in a string-safe way here & in `from_descriptor`
        super().__init__(descriptor)
        self._storage: t.Dict[str, t.Union[str, bytes]] = storage

@@ -97,7 +98,8 @@ def from_descriptor(
        :raises SmartSimError: If attachment to DragonFeatureStore fails
        """
        try:
-            return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor))
+            logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}")
+            return cls(dragon_ddict.DDict.attach(descriptor))
        except Exception as ex:
            logger.error(f"Error creating dragon feature store: {descriptor}")
            raise SmartSimError(
diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py
index a55c52305..ac6cdaf31 100644
--- a/smartsim/_core/mli/infrastructure/storage/feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py
@@ -43,6 +43,14 @@ class ReservedKeys(str, enum.Enum):
    """Storage location for the list of registered consumers that will receive
    events from an EventBroadcaster"""

+    MLI_BACKEND_CONSUMER = "_SMARTSIM_MLI_BACKEND_CONSUMER"
+    """Storage location for the channel used to send messages directly to
+    the MLI backend"""
+
+    MLI_WORKER_QUEUE = "to_worker_fli"  # todo: ensure this adheres to standard
+    """Storage location for the channel used to send work requests
+    to the available worker managers"""
+
    @classmethod
    def contains(cls, value: str) -> bool:
        """Convert a string representation into an enumeration member.
@@ -59,7 +67,27 @@

@dataclass(frozen=True)
-class FeatureStoreKey:
+class TensorKey:
    """A key,descriptor pair enabling retrieval of an item from a feature store."""

    key: str
    """The unique key of an item in a feature store"""
    descriptor: str
    """The unique identifier of the feature store containing the key"""

    def __post_init__(self) -> None:
        """Ensure the key and descriptor have at least one character.
+ + :raises ValueError: If key or descriptor are empty strings + """ + if len(self.key) < 1: + raise ValueError("Key must have at least one character.") + if len(self.descriptor) < 1: + raise ValueError("Descriptor must have at least one character.") + + +@dataclass(frozen=True) +class ModelKey: """A key,descriptor pair enabling retrieval of an item from a feature store.""" key: str diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 530d25154..ac1a14866 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -39,17 +39,16 @@ from ...comm.channel.channel import CommChannelBase from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -from ..storage.feature_store import FeatureStore, FeatureStoreKey +from ..storage.feature_store import FeatureStore, ModelKey, TensorKey if t.TYPE_CHECKING: - from smartsim._core.mli.mli_schemas.data.data_references_capnp import TensorKey from smartsim._core.mli.mli_schemas.response.response_capnp import Status from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) # Placeholder -ModelIdentifier = FeatureStoreKey +ModelIdentifier = ModelKey class InferenceRequest: @@ -57,12 +56,12 @@ class InferenceRequest: def __init__( self, - model_key: t.Optional[FeatureStoreKey] = None, + model_key: t.Optional[ModelKey] = None, callback: t.Optional[CommChannelBase] = None, raw_inputs: t.Optional[t.List[bytes]] = None, - input_keys: t.Optional[t.List[FeatureStoreKey]] = None, + input_keys: t.Optional[t.List[TensorKey]] = None, input_meta: t.Optional[t.List[t.Any]] = None, - output_keys: t.Optional[t.List[FeatureStoreKey]] = None, + output_keys: t.Optional[t.List[TensorKey]] = None, raw_model: t.Optional[Model] = None, batch_size: int = 0, ): @@ -153,7 +152,7 @@ class InferenceReply: def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, - output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, + output_keys: t.Optional[t.Collection[TensorKey]] = None, status_enum: "Status" = "running", message: str = "In progress", ) -> None: @@ -166,7 +165,7 @@ def __init__( """ self.outputs: t.Collection[t.Any] = outputs or [] """List of output data""" - self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] + self.output_keys: t.Collection[t.Optional[TensorKey]] = output_keys or [] """List of keys used for output data""" self.status_enum = status_enum """Status of the reply""" @@ -320,7 +319,7 @@ class RequestBatch: """List of InferenceRequests in the batch""" inputs: t.Optional[TransformInputResult] """Transformed batch of input tensors""" - model_id: ModelIdentifier + model_id: "ModelIdentifier" """Model (key, descriptor) tuple""" @property @@ -350,7 +349,7 @@ def raw_model(self) -> t.Optional[t.Any]: return None @property - def input_keys(self) -> t.List[FeatureStoreKey]: + def input_keys(self) -> t.List[TensorKey]: """All input keys available in this batch's requests. :returns: All input keys belonging to requests in this batch""" @@ -361,7 +360,7 @@ def input_keys(self) -> t.List[FeatureStoreKey]: return keys @property - def output_keys(self) -> t.List[FeatureStoreKey]: + def output_keys(self) -> t.List[TensorKey]: """All output keys available in this batch's requests. 
:returns: All output keys belonging to requests in this batch""" @@ -378,7 +377,7 @@ class MachineLearningWorkerCore: @staticmethod def deserialize_message( data_blob: bytes, - callback_factory: t.Callable[[bytes], CommChannelBase], + callback_factory: t.Callable[[str], CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest. @@ -388,27 +387,27 @@ def deserialize_message( :returns: The raw input message deserialized into an InferenceRequest """ request = MessageHandler.deserialize_request(data_blob) - model_key: t.Optional[FeatureStoreKey] = None + model_key: t.Optional[ModelKey] = None model_bytes: t.Optional[Model] = None if request.model.which() == "key": - model_key = FeatureStoreKey( + model_key = ModelKey( key=request.model.key.key, - descriptor=request.model.key.featureStoreDescriptor, + descriptor=request.model.key.descriptor, ) elif request.model.which() == "data": model_bytes = request.model.data callback_key = request.replyChannel.descriptor comm_channel = callback_factory(callback_key) - input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_keys: t.Optional[t.List[TensorKey]] = None input_bytes: t.Optional[t.List[bytes]] = None - output_keys: t.Optional[t.List[FeatureStoreKey]] = None + output_keys: t.Optional[t.List[TensorKey]] = None input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + TensorKey(key=value.key, descriptor=value.descriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": @@ -416,7 +415,7 @@ def deserialize_message( if request.output: output_keys = [ - FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + TensorKey(key=value.key, descriptor=value.descriptor) for value in request.output ] @@ -545,7 +544,7 @@ def place_output( request: InferenceRequest, transform_result: TransformOutputResult, feature_stores: t.Dict[str, FeatureStore], - ) -> t.Collection[t.Optional[FeatureStoreKey]]: + ) -> t.Collection[t.Optional[TensorKey]]: """Given a collection of data, make it available as a shared resource in the feature store. @@ -558,7 +557,7 @@ def place_output( if not feature_stores: raise ValueError("Feature store is required for output persistence") - keys: t.List[t.Optional[FeatureStoreKey]] = [] + keys: t.List[t.Optional[TensorKey]] = [] # need to decide how to get back to original sub-batch inputs so they can be # accurately placed, datum might need to include this. diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 71def143a..d7324e4a4 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -73,7 +73,7 @@ def build_output_tensor_descriptor( order, data type, and dimensions. 
        :param order: Order of the tensor, such as row-major (c) or column-major (f)
-        :param keys: List of TensorKeys to apply transorm descriptor to
+        :param keys: List of TensorKey objects the transform descriptor applies to
        :param data_type: Transform data type of the tensor
        :param dimensions: Transform dimensions of the tensor
        :returns: The OutputDescriptor
@@ -92,14 +92,12 @@
        return description

    @staticmethod
-    def build_tensor_key(
-        key: str, feature_store_descriptor: str
-    ) -> data_references_capnp.TensorKey:
+    def build_tensor_key(key: str, descriptor: str) -> data_references_capnp.TensorKey:
        """
        Builds a new TensorKey message with the provided key.

        :param key: String to set the TensorKey
-        :param feature_store_descriptor: A descriptor identifying the feature store
+        :param descriptor: A descriptor identifying the feature store
        containing the key
        :returns: The TensorKey
        :raises ValueError: If building fails
        """
        try:
            tensor_key = data_references_capnp.TensorKey.new_message()
            tensor_key.key = key
-            tensor_key.featureStoreDescriptor = feature_store_descriptor
+            tensor_key.descriptor = descriptor
        except Exception as e:
            raise ValueError("Error building tensor key.") from e
        return tensor_key
@@ -133,14 +131,12 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model:
        return model

    @staticmethod
-    def build_model_key(
-        key: str, feature_store_descriptor: str
-    ) -> data_references_capnp.ModelKey:
+    def build_model_key(key: str, descriptor: str) -> data_references_capnp.ModelKey:
        """
        Builds a new ModelKey message with the provided key.

        :param key: String to set the ModelKey
-        :param feature_store_descriptor: A descriptor identifying the feature store
+        :param descriptor: A descriptor identifying the feature store
        containing the key
        :returns: The ModelKey
        :raises ValueError: If building fails
        """
        try:
            model_key = data_references_capnp.ModelKey.new_message()
            model_key.key = key
-            model_key.featureStoreDescriptor = feature_store_descriptor
+            model_key.descriptor = descriptor
        except Exception as e:
            raise ValueError("Error building model key.") from e
        return model_key

    @staticmethod
    def _assign_model(
@@ -242,7 +238,7 @@
    @staticmethod
    def _assign_reply_channel(
-        request: request_capnp.Request, reply_channel: bytes
+        request: request_capnp.Request, reply_channel: str
    ) -> None:
        """
        Assigns a reply channel to the supplied request.
@@ -360,7 +356,7 @@ def _assign_custom_request_attributes( @staticmethod def build_request( - reply_channel: bytes, + reply_channel: str, model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], inputs: t.Union[ t.List[data_references_capnp.TensorKey], diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index 699abe5d2..65293be7b 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -28,10 +28,10 @@ struct ModelKey { key @0 :Text; - featureStoreDescriptor @1 :Text; + descriptor @1 :Text; } struct TensorKey { key @0 :Text; - featureStoreDescriptor @1 :Text; + descriptor @1 :Text; } diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index bcf53e0a0..a5e318a55 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -36,7 +36,7 @@ from typing import Iterator class ModelKey: key: str - featureStoreDescriptor: str + descriptor: str @staticmethod @contextmanager def from_bytes( @@ -72,7 +72,7 @@ class ModelKeyBuilder(ModelKey): class TensorKey: key: str - featureStoreDescriptor: str + descriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 4be1cfa21..26d9542d9 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -32,7 +32,7 @@ using DataRef = import "../data/data_references.capnp"; using Models = import "../model/model.capnp"; struct ChannelDescriptor { - descriptor @0 :Data; + descriptor @0 :Text; } struct Request { diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index a4ad631f9..2aab80b1d 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -61,7 +61,7 @@ from .request_attributes.request_attributes_capnp import ( ) class ChannelDescriptor: - descriptor: bytes + descriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/smartsim/log.py b/smartsim/log.py index 3d6c0860e..a28112efa 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -252,7 +252,9 @@ def filter(self, record: logging.LogRecord) -> bool: return record.levelno <= level_no -def log_to_file(filename: str, log_level: str = "debug") -> None: +def log_to_file( + filename: str, log_level: str = "debug", logger: t.Optional[logging.Logger] = None +) -> None: """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. @@ -261,7 +263,8 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: to allow the file to store more or less verbose logging information. """ - logger = logging.getLogger("SmartSim") + if logger is None: + logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with filename, "w+", encoding="utf-8" ) diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py new file mode 100644 index 000000000..bf195a756 --- /dev/null +++ b/smartsim/protoclient.py @@ -0,0 +1,285 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +# pylint: disable=unused-import,import-error +import dragon +from dragon import fli +import dragon.channels +from dragon.globalservices.api_setup import connect_to_infrastructure + +# isort: on +# pylint: enable=unused-import,import-error + +import numbers +import os +import time +import typing as t +from collections import OrderedDict + +import numpy +import torch + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBroadcaster, + EventProducer, + OnWriteFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils.timings import PerfTimer +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# from mpi4py import MPI + + +_TimingDict = OrderedDict[str, list[str]] + + +logger = get_logger("App") +logger.info("Started app") +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False + + +class ProtoClient: + @staticmethod + def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: + """Use the supplied environment variables to attach + to a pre-existing backbone featurestore. Requires the + environment to contain `_SMARTSIM_INFRA_BACKBONE` + environment variable + + :returns: the attached backbone featurestore""" + # todo: ensure this env var from config loader or constant + descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + if descriptor is None: + raise SmartSimError( + "Missing required backbone configuration in environment" + ) + + backbone = t.cast( + BackboneFeatureStore, BackboneFeatureStore.from_descriptor(descriptor) + ) + backbone.wait_timeout = wait_timeout + return backbone + + def _attach_to_worker_queue(self) -> DragonFLIChannel: + """Wait until the backbone contains the worker queue configuration, + then attach an FLI to the given worker queue""" + configuration = self._backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) + # descriptor = configuration.get(BackboneFeatureStore.MLI_WORKER_QUEUE, None) + # NOTE: without wait_for, this MUST be in the backbone.... 
    @staticmethod
    def _create_worker_channels() -> t.Tuple[DragonCommChannel, DragonCommChannel]:
        """Create local channels used to receive results from, and stream
        tensors to, the worker queue

        :returns: a tuple of (from_worker, to_worker) channels"""
        _from_worker_ch = DragonCommChannel.from_local()
        _to_worker_ch = DragonCommChannel.from_local()

        return _from_worker_ch, _to_worker_ch

    def _create_broadcaster(self) -> EventProducer:
        """Create an event publisher that will broadcast updates to
        other MLI components.

        :returns: the event publisher instance"""
        broadcaster: EventProducer = EventBroadcaster(
            self._backbone, DragonCommChannel.from_descriptor
        )
        return broadcaster

    def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
        """Initialize the client instance

        :param timing_on: Flag indicating if timing information should be
        written to file
        :param wait_timeout: Maximum wait time allowed to attach to the
        worker queue
        :raises SmartSimError: if unable to attach to a backbone featurestore"""
        # todo: restore MPI rank detection (mpi4py) once available
        # comm = MPI.COMM_WORLD
        # rank = comm.Get_rank()
        rank: int = 0
        self._queue_timeout = wait_timeout

        connect_to_infrastructure()
        self._backbone = self._attach_to_backbone(wait_timeout=wait_timeout)
        self._to_worker_fli = self._attach_to_worker_queue()

        channels = self._create_worker_channels()
        self._from_worker_ch = channels[0]
        self._to_worker_ch = channels[1]

        self._publisher = self._create_broadcaster()

        self.perf_timer: PerfTimer = PerfTimer(
            debug=False, timing_on=timing_on, prefix=f"a{rank}_"
        )
        self._start: t.Optional[float] = None
        self._interm: t.Optional[float] = None
        self._timings: _TimingDict = OrderedDict()
        self._timing_on = timing_on

    def _add_label_to_timings(self, label: str) -> None:
        if label not in self._timings:
            self._timings[label] = []

    @staticmethod
    def _format_number(number: t.Union[numbers.Number, float]) -> str:
        return f"{number:0.4e}"

    def start_timings(self, batch_size: numbers.Number) -> None:
        if self._timing_on:
            self._add_label_to_timings("batch_size")
            self._timings["batch_size"].append(self._format_number(batch_size))
            self._start = time.perf_counter()
            self._interm = time.perf_counter()

    def end_timings(self) -> None:
        if self._timing_on and self._start is not None:
            self._add_label_to_timings("total_time")
            self._timings["total_time"].append(
                self._format_number(time.perf_counter() - self._start)
            )
measure_time(self, label: str) -> None: + if self._timing_on and self._interm is not None: + self._add_label_to_timings(label) + self._timings[label].append( + self._format_number(time.perf_counter() - self._interm) + ) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False) -> None: + print(" ".join(self._timings.keys())) + + value_array = numpy.array(self._timings.values(), dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: + tensors = [batch.numpy()] + self.perf_timer.start_timings("batch_size", batch.shape[0]) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape) + ) + self.perf_timer.measure_time("build_tensor_descriptor") + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model, self._backbone.descriptor) + else: + model_arg = MessageHandler.build_model( + model, "resnet-50", "1.0" + ) # type: ignore + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch.descriptor, + model=model_arg, + inputs=[built_tensor_desc], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.perf_timer.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.perf_timer.measure_time("serialize_request") + + if self._to_worker_fli is None: + raise ValueError("No worker queue available.") + + # pylint: disable-next=protected-access + with self._to_worker_fli._channel.sendh( # type: ignore + timeout=None, + stream_channel=self._to_worker_ch.channel, + ) as to_sendh: + to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! + # to_sendh.send_bytes(bytes(tensor.data)) + logger.info(f"Message size: {len(request_bytes)} bytes") + + self.perf_timer.measure_time("send_tensors") + with self._from_worker_ch.channel.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp) + self.perf_timer.measure_time("deserialize_response") + # list of data blobs? + # recv depending on the len(response.result.descriptors)? + data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_tensor") + result = torch.from_numpy( + numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) + ) + self.perf_timer.measure_time("deserialize_tensor") + + self.perf_timer.end_timings() + return result + + def set_model(self, key: str, model: bytes) -> None: + # todo: incorrect usage of backbone here to store + # user models? are we using the backbone if they do NOT + # have a feature store of their own? 
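+        # the raw model bytes land in the backbone under the user-supplied
+        # key; the OnWriteFeatureStore event below alerts registered listeners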
+ self._backbone[key] = model + + # notify components of a change in the data at this key + event = OnWriteFeatureStore(self._backbone.descriptor, key) + self._publisher.send(event) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py new file mode 100644 index 000000000..a4e61d430 --- /dev/null +++ b/tests/dragon/test_dragon_backend.py @@ -0,0 +1,174 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import unittest.mock as mock
+
+import pytest
+
+# from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer
+# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import EventSender, OnCreateConsumer
+
+# dragon = pytest.importorskip("dragon")
+
+# from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+def test_dragonbackend_listener_bootstrapping(monkeypatch: pytest.MonkeyPatch):
+    """Verify that an event listener is started.
+
+    TODO: placeholder until the backend eventing bootstrap is testable;
+    the intended flow is preserved in the comments below"""
+    # backend_channel = DragonCommChannel.from_local()
+    assert True
+
+    # with monkeypatch.context() as patcher:
+    #     patcher.setattr(NodePrioritizer, "__init__", lambda self, nodes, lock: None)
+    #     patcher.setattr(DragonBackend, "_initialize_hosts", lambda self: None)
+
+    #     backend = DragonBackend(pid=9999)
+    #     backend._create_backbone()
+
+    #     # create the consumer and start a listener process
+    #     backend_consumer = backend._create_eventing(backend._backbone)
+
+    #     # ensure the consumer that was created is retained
+    #     assert backend._event_consumer is not None
+    #     assert backend._event_consumer == backend_consumer
+
+    #     assert backend._backbone.notification_channels == [backend_consumer.descriptor]
+
+    #     # create components to publish events
+    #     sender = EventSender(backend._backbone, backend_channel)
+
+    #     # simulate a new consumer registration
+    #     new_consumer_channel = DragonCommChannel.from_local()
+    #     registration = OnCreateConsumer(new_consumer_channel.descriptor)
+    #     new_consumer_channel.send(bytes(registration), 0.1)
+
+    #     events = backend_consumer.receive()
+    #     assert len(events) == 1
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index e9bcc8dfd..b8c2af9c0 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -63,7 +63,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonFLIChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_sender_supplied_descriptor,
     )
 
     config_queue = config.get_queue()
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index 618b00d87..0f3e38f93 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import typing as t from unittest.mock import MagicMock import pytest @@ -38,6 +39,7 @@ from dragon.fli import FLInterface from dragon.mpbridge.queues import DragonQueue +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.device_manager import WorkerDevice from smartsim._core.mli.infrastructure.control.request_dispatcher import ( @@ -62,11 +64,13 @@ InferenceReply, InferenceRequest, LoadModelResult, + MachineLearningWorkerBase, RequestBatch, TransformInputResult, TransformOutputResult, ) from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.mli.mli_schemas.response.response_capnp import ResponseBuilder from .utils.channel import FileSystemCommChannel from .utils.worker import IntegratedTorchWorker @@ -92,7 +96,7 @@ def app_feature_store() -> FeatureStore: @pytest.fixture def setup_worker_manager_model_bytes( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, @@ -110,10 +114,10 @@ def setup_worker_manager_model_bytes( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) - dispatcher_task_queue = mp.Queue(maxsize=0) + dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, @@ -123,10 +127,14 @@ def setup_worker_manager_model_bytes( cooldown=3, ) - tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + tensor_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + output_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) - request = InferenceRequest( + inf_request = InferenceRequest( model_key=None, callback=None, raw_inputs=None, @@ -140,7 +148,7 @@ def setup_worker_manager_model_bytes( model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) request_batch = RequestBatch( - [request], + [inf_request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_id=model_id, ) @@ -169,10 +177,10 @@ def setup_worker_manager_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) - dispatcher_task_queue = mp.Queue(maxsize=0) + dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, @@ -208,7 +216,7 @@ def setup_worker_manager_model_key( @pytest.fixture def setup_request_dispatcher_model_bytes( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, @@ -226,7 +234,7 @@ def setup_request_dispatcher_model_bytes( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) request_dispatcher = 
RequestDispatcher( @@ -237,8 +245,12 @@ def setup_request_dispatcher_model_bytes( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + tensor_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + output_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [output_key], [], None @@ -252,7 +264,7 @@ def setup_request_dispatcher_model_bytes( @pytest.fixture def setup_request_dispatcher_model_key( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, @@ -270,7 +282,7 @@ def setup_request_dispatcher_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) request_dispatcher = RequestDispatcher( @@ -281,9 +293,13 @@ def setup_request_dispatcher_model_key( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - model_key = MessageHandler.build_model_key( + tensor_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + output_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + model_key = MessageHandler.build_feature_store_key( key="model key", feature_store_descriptor=app_feature_store.descriptor ) request = MessageHandler.build_request( @@ -296,8 +312,12 @@ def setup_request_dispatcher_model_key( return request_dispatcher, integrated_worker_type -def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): - def mock_stage(*args, **kwargs): +def mock_pipeline_stage( + monkeypatch: pytest.MonkeyPatch, + integrated_worker: MachineLearningWorkerBase, + stage: str, +) -> t.Callable[[t.Any], ResponseBuilder]: + def mock_stage(*args: t.Any, **kwargs: t.Any) -> None: raise ValueError(f"Simulated error in {stage}") monkeypatch.setattr(integrated_worker, stage, mock_stage) @@ -314,8 +334,10 @@ def mock_stage(*args, **kwargs): mock_reply_channel = MagicMock() mock_reply_channel.send = MagicMock() - def mock_exception_handler(exc, reply_channel, failure_message): - return exception_handler(exc, mock_reply_channel, failure_message) + def mock_exception_handler( + exc: Exception, reply_channel: CommChannelBase, failure_message: str + ) -> None: + exception_handler(exc, mock_reply_channel, failure_message) monkeypatch.setattr( "smartsim._core.mli.infrastructure.control.worker_manager.exception_handler", @@ -362,12 +384,12 @@ def mock_exception_handler(exc, reply_channel, failure_message): ], ) def test_wm_pipeline_stage_errors_handled( - request, - setup_worker_manager, + request: pytest.FixtureRequest, + setup_worker_manager: str, monkeypatch: pytest.MonkeyPatch, stage: str, error_message: str, -): +) -> None: """Ensures that the worker manager does not crash after a failure in various pipeline stages""" worker_manager, integrated_worker_type = request.getfixturevalue( setup_worker_manager @@ -446,12 +468,12 @@ def 
test_wm_pipeline_stage_errors_handled( ], ) def test_dispatcher_pipeline_stage_errors_handled( - request, - setup_request_dispatcher, + request: pytest.FixtureRequest, + setup_request_dispatcher: str, monkeypatch: pytest.MonkeyPatch, stage: str, error_message: str, -): +) -> None: """Ensures that the request dispatcher does not crash after a failure in various pipeline stages""" request_dispatcher, integrated_worker_type = request.getfixturevalue( setup_request_dispatcher @@ -473,7 +495,7 @@ def test_dispatcher_pipeline_stage_errors_handled( mock_reply_fn.assert_called_with("fail", error_message) -def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): +def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch) -> None: """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py new file mode 100644 index 000000000..a2c8118ac --- /dev/null +++ b/tests/dragon/test_featurestore.py @@ -0,0 +1,338 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
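+
+# A rough sketch of the pub/sub flow these tests exercise (assuming a dragon
+# environment; the names mirror the imports used below):
+#
+#   backbone = BackboneFeatureStore(ddict, allow_reserved_writes=True)
+#   consumer = EventConsumer(
+#       channel, backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN]
+#   )
+#   publisher = EventBroadcaster(backbone, DragonCommChannel.from_descriptor)
+#   backbone.notification_channels = [channel.descriptor]
+#   publisher.send(OnWriteFeatureStore(backbone.descriptor, "key"))
+#   events = consumer.receive()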
+
+
+import multiprocessing as mp
+import random
+import time
+import typing as t
+import unittest.mock as mock
+import uuid
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBroadcaster,
+    EventCategory,
+    EventConsumer,
+    OnCreateConsumer,
+    OnWriteFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    time as bbtime,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+# isort: off
+from dragon import fli
+from dragon.channels import Channel
+
+# isort: on
+
+if t.TYPE_CHECKING:
+    import conftest
+
+
+# The tests in this file must run in a dragon environment
+pytestmark = pytest.mark.dragon
+WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+
+
+@pytest.fixture
+def storage_for_dragon_fs() -> dragon_ddict.DDict:
+    return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3)
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_req_queue(
+    storage_for_dragon_fs: dragon_ddict.DDict,
+) -> dragon_ddict.DDict:
+    # create a valid FLI so any call to attach does not fail
+    channel_ = Channel.make_process_local()
+    fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None)
+    comm_channel = DragonFLIChannel(fli_, True)
+
+    storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor
+    return storage_for_dragon_fs
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_mock_req_queue(
+    storage_for_dragon_fs: dragon_ddict.DDict,
+) -> dragon_ddict.DDict:
+    mock_descriptor = "12345"
+    storage_for_dragon_fs[WORK_QUEUE_KEY] = mock_descriptor
+    return storage_for_dragon_fs
+
+
+def test_eventconsumer_eventpublisher_integration(
+    storage_for_dragon_fs: t.Any, test_dir: str
+) -> None:
+    """Verify that the publisher and consumer integrate as expected when
+    multiple publishers and consumers are sending simultaneously. This
+    test closely tracks the test in tests/test_featurestore.py also named
+    test_eventconsumer_eventpublisher_integration but requires dragon entities
+
+    :param storage_for_dragon_fs: the dragon storage engine to use
+    :param test_dir: pytest fixture automatically generating unique working
+    directories for individual test outputs"""
+
+    mock_storage = storage_for_dragon_fs
+    backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+
+    # verify ability to write and read from ddict
+    backbone["test_dir"] = test_dir
+    assert backbone["test_dir"] == test_dir
+
+    wmgr_channel_ = Channel.make_process_local()
+    capp_channel_ = Channel.make_process_local()
+    back_channel_ = Channel.make_process_local()
+
+    wmgr_channel = DragonCommChannel(wmgr_channel_)
+    capp_channel = DragonCommChannel(capp_channel_)
+    back_channel = DragonCommChannel(back_channel_)
+
+    wmgr_consumer_descriptor = wmgr_channel.descriptor_string
+    capp_consumer_descriptor = capp_channel.descriptor_string
+    back_consumer_descriptor = back_channel.descriptor_string
+
+    # create some consumers to receive messages
+    wmgr_consumer = EventConsumer(
+        wmgr_channel,
+        backbone,
+        filters=[EventCategory.FEATURE_STORE_WRITTEN],
+    )
+    capp_consumer = EventConsumer(
+        capp_channel,
+        backbone,
+    )
+    back_consumer = EventConsumer(
+        back_channel,
+        backbone,
+        filters=[EventCategory.CONSUMER_CREATED],
+    )
+
+    # create some broadcasters to publish messages
+    mock_worker_mgr = EventBroadcaster(
+        backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+    mock_client_app = EventBroadcaster(
+        backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+
+    # register all of the consumers directly; automatic registration via
+    # OnCreateConsumer event processing is tested elsewhere
+    backbone.notification_channels = [
+        wmgr_consumer_descriptor,
+        capp_consumer_descriptor,
+        back_consumer_descriptor,
+    ]
+
+    # simulate worker manager sending a notification to backend that it's alive
+    event_1 = OnCreateConsumer(wmgr_consumer_descriptor, [])
+    mock_worker_mgr.send(event_1)
+
+    # simulate the app updating a model a few times
+    event_2 = OnWriteFeatureStore(backbone.descriptor, "key-1")
+    event_3 = OnWriteFeatureStore(backbone.descriptor, "key-2")
+    event_4 = OnWriteFeatureStore(backbone.descriptor, "key-1")
+
+    mock_client_app.send(event_2)
+    mock_client_app.send(event_3)
+    mock_client_app.send(event_4)
+
+    # the worker manager should only receive the feature store write events
+    wmgr_messages = wmgr_consumer.receive()
+    assert len(wmgr_messages) == 3
+
+    # the backend should only receive messages about consumer creation
+    back_messages = back_consumer.receive()
+    assert len(back_messages) == 1
+
+    # hypothetical app has no filters and will get all events
+    app_messages = capp_consumer.receive()
+    assert len(app_messages) == 4
+
+
+def test_backbone_wait_for_prepopulated(
+    storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Verify that asking the backbone to wait for a value succeeds
+    immediately and does not cause a wait to occur if the data exists
+
+    :param storage_for_dragon_fs_with_req_queue: the storage engine to use,
+    prepopulated with a valid worker queue descriptor"""
+    # set a very low timeout to confirm that the call returns without waiting
+    wait_timeout = 0.1
+    storage = storage_for_dragon_fs_with_req_queue
+
+    backbone = BackboneFeatureStore(storage)
+
+    with monkeypatch.context() as ctx:
+        # all keys should be found and the timeout should never be checked
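+        # `bbtime` is the `time` module as imported by the backbone feature
+        # store module, so patching its `sleep` intercepts the wait_for backoff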
+        ctx.setattr(bbtime, "sleep", mock.MagicMock())
+
+        values = backbone.wait_for([WORK_QUEUE_KEY], wait_timeout)
+
+        # confirm that wait_for with one key returns one value
+        assert len(values) == 1
+
+        # confirm that the descriptor is non-null w/some non-trivial value
+        assert len(values[WORK_QUEUE_KEY]) > 5
+
+        # confirm that no wait occurred
+        bbtime.sleep.assert_not_called()
+
+
+def set_value_after_delay(
+    descriptor: str, key: str, value: str, delay: float = 5
+) -> None:
+    """Helper method to persist a value into the backbone after a delay
+
+    :param descriptor: the backbone feature store descriptor to attach to
+    :param key: the key to write to
+    :param value: a value to write to the key
+    :param delay: the number of seconds to wait before writing"""
+    time.sleep(delay)
+
+    backbone = BackboneFeatureStore.from_descriptor(descriptor)
+    backbone[key] = value
+    logger.debug(f"set_value_after_delay wrote `{value}` to backbone[`{key}`]")
+
+
+@pytest.mark.parametrize("delay", [0, 1, 2, 4, 8])
+def test_backbone_wait_for_partial_prepopulated(
+    storage_for_dragon_fs_with_mock_req_queue: t.Any, delay: float
+) -> None:
+    """Verify that when data is not all in the backbone, the `wait_for` operation
+    continues to poll until it finds everything it needs
+
+    :param storage_for_dragon_fs_with_mock_req_queue: the storage engine to use,
+    prepopulated with a mock worker queue descriptor
+    :param delay: the number of seconds the second process will wait before
+    setting the target value in the backbone featurestore
+    """
+    # allow ample time for the delayed writer to complete before timing out
+    wait_timeout = 10
+    storage = storage_for_dragon_fs_with_mock_req_queue
+    backbone = BackboneFeatureStore(storage)
+
+    key, value = str(uuid.uuid4()), str(random.random() * 10)
+
+    logger.debug(f"Starting process to write {key} after {delay}s")
+    p = mp.Process(
+        target=set_value_after_delay, args=(backbone.descriptor, key, value, delay)
+    )
+    p.start()
+
+    p2 = mp.Process(
+        target=backbone.wait_for,
+        args=([WORK_QUEUE_KEY, key],),
+        kwargs={"timeout": wait_timeout},
+    )
+    p2.start()
+
+    p.join()
+    p2.join()
+
+    # both values should be written at this time
+    ret_vals = backbone.wait_for([WORK_QUEUE_KEY, key], 0.1)
+    # confirm that wait_for with two keys returns two values
+    assert len(ret_vals) == 2, "values should contain values for both awaited keys"
+
+    # confirm the pre-populated value has the correct output
+    assert ret_vals[WORK_QUEUE_KEY] == "12345"  # mock descriptor value from fixture
+
+    # confirm the population process completed and the awaited value is correct
+    assert ret_vals[key] == value, "awaited value does not match the written value"
+
+
+@pytest.mark.parametrize("num_keys", [0, 1, 3, 7, 11])
+def test_backbone_wait_for_multikey(
+    storage_for_dragon_fs_with_req_queue: t.Any,
+    num_keys: int,
+) -> None:
+    """Verify that asking the backbone to wait for multiple keys results
+    in that number of values being returned
+
+    :param storage_for_dragon_fs_with_req_queue: the storage engine to use,
+    prepopulated with a valid worker queue descriptor
+    :param num_keys: the number of extra keys to set & request in the backbone
+    """
+    # maximum delay allowed for setter processes
+    max_delay = 5
+    storage = storage_for_dragon_fs_with_req_queue
+    backbone = BackboneFeatureStore(storage)
+
+    extra_keys = [str(uuid.uuid4()) for _ in range(num_keys)]
+    extra_values = [str(uuid.uuid4()) for _ in range(num_keys)]
+    extras = dict(zip(extra_keys, extra_values))
+    delays = [random.random() * max_delay for _ in range(num_keys)]
+    processes = []
+
+    for key, value, delay in zip(extra_keys, extra_values, delays):
+        assert delay < max_delay, "write delay exceeds test timeout"
+        logger.debug(f"Delaying {key} write by {delay} seconds")
+        p = mp.Process(
+            target=set_value_after_delay, args=(backbone.descriptor, key, value, delay)
+        )
+        p.start()
+        processes.append(p)
+
+    p2 = mp.Process(
+        target=backbone.wait_for,
+        args=([*extra_keys],),
+        kwargs={"timeout": max_delay * 2},
+    )
+    p2.start()
+    for p in processes:
+        p.join(timeout=max_delay * 2)
+    p2.join(
+        timeout=max_delay * 2
+    )  # allow the waiter the same window as its wait_for timeout
+
+    # use a minimal wait to verify all values are already written
+    actual_values = backbone.wait_for([*extra_keys], timeout=0.01)
+
+    # confirm that wait_for returns all the expected values
+    assert len(actual_values) == num_keys
+
+    # confirm that the returned values match (e.g. are returned in the right order)
+    for k in extras:
+        assert extras[k] == actual_values[k]
diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 932e734c8..bb5dccad7 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import pathlib
+import time
 import typing as t
 
 import pytest
@@ -54,6 +55,21 @@
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
+WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+RANDOMLY_SET_KEY = "_SOMETHING_ELSE"
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]:
+    storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"}
+    return storage
+
+
+def boom(*args, **kwargs) -> None:
+    """Helper function that blows up when used to mock up
+    some other function"""
+    raise Exception(f"you shall not pass! {args}, {kwargs}")
+
 
 def test_event_uid() -> None:
     """Verify that all events include a unique identifier"""
@@ -62,7 +78,7 @@ def test_event_uid() -> None:
 
     # generate a bunch of events and keep track all the IDs
     for i in range(num_iters):
-        event_a = OnCreateConsumer(str(i))
+        event_a = OnCreateConsumer(str(i), filters=[])
         event_b = OnWriteFeatureStore(str(i), "key")
 
         uids.add(event_a.uid)
@@ -177,7 +193,7 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None:
 
     # NOTE: we're not putting any consumers into the backbone here!
backbone = BackboneFeatureStore(mock_storage) - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) publisher = EventBroadcaster(backbone) num_receivers = 0 @@ -185,7 +201,7 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # publishing this event without any known consumers registered should succeed # but report that it didn't have anybody to send the event to consumer_descriptor = storage_path / f"test-consumer" - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) num_receivers += publisher.send(event) @@ -215,7 +231,7 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [] - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) publisher = EventBroadcaster( backbone, channel_factory=FileSystemCommChannel.from_descriptor ) @@ -247,7 +263,7 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [consumer_descriptor] - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) publisher = EventBroadcaster( backbone, # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied @@ -281,11 +297,11 @@ def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: # mock building up some buffered events num_buffered_events = 14 for i in range(num_buffered_events): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) publisher._event_buffer.append(bytes(event)) event0 = OnCreateConsumer( - storage_path / f"test-consumer-{str(num_buffered_events + 1)}" + storage_path / f"test-consumer-{str(num_buffered_events + 1)}", [] ) num_receivers = publisher.send(event0) @@ -332,13 +348,13 @@ def test_eventpublisher_broadcast_returns_total_sent( # mock building up some buffered events for i in range(num_buffered): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) publisher._event_buffer.append(bytes(event)) assert publisher.num_buffered == num_buffered # this event will trigger clearing anything already in buffer - event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}") + event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", []) # num_receivers should contain a number that computes w/all consumers and all events num_receivers = publisher.send(event0) @@ -363,7 +379,7 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: backbone, channel_factory=FileSystemCommChannel.from_descriptor ) - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) # the only registered cnosumer is in the event, expect no pruning backbone.notification_channels = (consumer_descriptor,) @@ -377,7 +393,7 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: # ... 
and remove the old descriptor from the backbone when it's looked up
     backbone.notification_channels = (consumer_descriptor2,)
 
-    event = OnCreateConsumer(consumer_descriptor2)
+    event = OnCreateConsumer(consumer_descriptor2, filters=[])
     publisher.send(event)
 
@@ -433,7 +449,7 @@ def test_eventpublisher_serialize_failure(
     )
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
 
         # patch the __bytes__ implementation to cause pickling to fail during send
         patch.setattr(event, "__bytes__", lambda x: b"abc")
@@ -471,7 +487,7 @@ def boom(descriptor: str) -> None:
     publisher = EventBroadcaster(backbone, channel_factory=boom)
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
 
         backbone.notification_channels = (target_descriptor,)
 
@@ -507,7 +523,7 @@ def boom(self) -> None:
         raise Exception("That was unexpected...")
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
 
         # patch the _broadcast implementation to cause send to fail after
         # after the event has been pickled
@@ -538,7 +554,7 @@ def test_eventconsumer_receive(test_dir: str) -> None:
     backbone = BackboneFeatureStore(mock_storage)
     comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
 
-    event = OnCreateConsumer(target_descriptor)
+    event = OnCreateConsumer(target_descriptor, filters=[])
 
     # simulate a sent event by writing directly to the input comm channel
     comm_channel.send(bytes(event))
@@ -574,7 +590,7 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
 
     # simulate multiple sent events by writing directly to the input comm channel
     for _ in range(num_sent):
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
         comm_channel.send(bytes(event))
 
     consumer = EventConsumer(comm_channel, backbone)
@@ -628,9 +644,9 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     capp_channel = FileSystemCommChannel(storage_path / "test-capp")
     back_channel = FileSystemCommChannel(storage_path / "test-backend")
 
-    wmgr_consumer_descriptor = wmgr_channel.descriptor.decode("utf-8")
-    capp_consumer_descriptor = capp_channel.descriptor.decode("utf-8")
-    back_consumer_descriptor = back_channel.descriptor.decode("utf-8")
+    wmgr_consumer_descriptor = wmgr_channel.descriptor
+    capp_consumer_descriptor = capp_channel.descriptor
+    back_consumer_descriptor = back_channel.descriptor
 
     # create some consumers to receive messages
     wmgr_consumer = EventConsumer(
@@ -667,7 +683,7 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     ]
 
     # simulate worker manager sending a notification to backend that it's alive
-    event_1 = OnCreateConsumer(wmgr_consumer_descriptor)
+    event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[])
     mock_worker_mgr.send(event_1)
 
     # simulate the app updating a model a few times
@@ -721,3 +737,43 @@ def test_eventconsumer_batch_timeout(
     )
 
     assert "positive" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "wait_timeout, exp_wait_max",
+    [
+        # aggregate the 1+1+1 into 3 on remaining parameters
+        pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"),
+        pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"),
+        pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"),
+        pytest.param(9, 3 + 2 + 4 + 8, id="9s wait, 6 cycle steps"),
+        # aggregate an entire cycle into 16
+        pytest.param(19.5, 16 + 3 + 2 + 4, id="20s wait, repeat cycle"),
+    ],
+)
+def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None:
+    """Verify that attempts to wait for keys missing from the backbone time
+    out in an appropriate amount of time. Note: due to the backoff, we verify
+    the elapsed time is less than a full cycle of waits
+
+    :param wait_timeout: the timeout to pass through to `wait_for`
+    :param exp_wait_max: a ceiling for the expected time spent waiting
+    """
+
+    # NOTE: exp_wait_max maps to the cycled backoff of [.1, .5, 1, 2, 4, 8]
+    # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
+    start_time = time.time()
+
+    storage: t.Dict[str, str] = {}
+    backbone = BackboneFeatureStore(storage)
+
+    with pytest.raises(SmartSimError) as ex:
+        backbone.wait_for(["does-not-exist"], wait_timeout)
+
+    end_time = time.time()
+    elapsed = end_time - start_time
+
+    # confirm that we waited at least the configured timeout
+    assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}"
+
+    # confirm that the total wait time is aligned with the sleep cycle
+    assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}"
diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py
index 59801eebe..104acd914 100644
--- a/tests/dragon/test_featurestore_integration.py
+++ b/tests/dragon/test_featurestore_integration.py
@@ -35,7 +35,6 @@
     DragonCommChannel,
     create_local,
 )
-from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
     EventBroadcaster,
@@ -131,7 +130,7 @@ def test_eventconsumer_eventpublisher_integration(
     ]
 
     # simulate worker manager sending a notification to backend that it's alive
-    event_1 = OnCreateConsumer(wmgr_consumer_descriptor)
+    event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[])
     mock_worker_mgr.send(event_1)
 
     # simulate the app updating a model a few times
diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py
new file mode 100644
index 000000000..590780154
--- /dev/null
+++ b/tests/dragon/test_protoclient.py
@@ -0,0 +1,231 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
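+
+# The client under test resolves all infrastructure from the environment; a
+# minimal sketch of the setup the fixtures below provide (`fli_channel`
+# stands in for the DragonFLIChannel created in `the_worker_queue`):
+#
+#   ddict = dragon_ddict.DDict(1, 2, 4 * 1024**2)
+#   backbone = BackboneFeatureStore(ddict, allow_reserved_writes=True)
+#   backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = fli_channel.descriptor
+#   os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor
+#   client = ProtoClient(timing_on=False)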
+
+import pickle
+import time
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBroadcaster,
+    OnWriteFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+# isort: off
+from dragon import fli
+from dragon.channels import Channel
+
+from smartsim.protoclient import ProtoClient
+
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+logger = get_logger(__name__)
+
+
+@pytest.fixture
+def storage_for_dragon_fs() -> dragon_ddict.DDict:
+    return dragon_ddict.DDict(1, 2, 4 * 1024**2)
+
+
+@pytest.fixture
+def the_backbone(storage_for_dragon_fs: dragon_ddict.DDict) -> BackboneFeatureStore:
+    """A pre-initialized backbone featurestore with reserved writes enabled"""
+    return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)
+
+
+@pytest.fixture
+def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel:
+    """A stand-in for the worker manager that ensures a worker queue
+    exists in the backbone"""
+
+    # create the FLI
+    to_worker_channel = Channel.make_process_local()
+    fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
+    comm_channel = DragonFLIChannel(fli_, True)
+
+    # store the descriptor in the backbone
+    the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor
+
+    try:
+        comm_channel.send(b"foo")
+    except Exception as ex:
+        logger.warning(f"Test send on the worker queue failed: {ex}")
+
+    return comm_channel
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_req_queue(
+    storage_for_dragon_fs: dragon_ddict.DDict,
+) -> dragon_ddict.DDict:
+    # create a valid FLI so any call to attach does not fail
+    channel_ = Channel.make_process_local()
+    fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None)
+    comm_channel = DragonFLIChannel(fli_, True)
+
+    storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor
+    return storage_for_dragon_fs
+
+
+@pytest.mark.parametrize(
+    "wait_timeout, exp_wait_max",
+    [
+        # aggregate the 1+1+1 into 3 on remaining parameters
+        pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"),
+        pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"),
+        pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"),
+    ],
+)
+def test_protoclient_timeout(
+    wait_timeout: float,
+    exp_wait_max: float,
+    the_backbone: BackboneFeatureStore,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that attempts to attach to the worker queue from the protoclient
+    time out in an appropriate amount of time. Note: due to the backoff, we
+    verify the elapsed time is less than a full cycle of waits
+
+    :param wait_timeout: a timeout for use when configuring a proto client
+    :param exp_wait_max: a ceiling for the expected time spent waiting for
+    the timeout
+    :param the_backbone: a pre-initialized backbone featurestore for setting up
+    the environment variable required by the client"""
+
+    # NOTE: exp_wait_max maps to the cycled backoff of [.1, .5, 1, 2, 4, 8]
+    # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
+    start_time = time.time()
+    with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex:
+        ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
+
+        ProtoClient(False, wait_timeout=wait_timeout)
+
+    end_time = time.time()
+    elapsed = end_time - start_time
+
+    # TODO: revisit - should this trigger any wait when the backbone is set above?
+    # confirm that we waited at least the configured timeout
+    # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}"
+
+    # confirm that the total wait time is aligned with the sleep cycle
+    assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}"
+
+
+def test_protoclient_initialization_no_backbone():
+    """Verify that attempting to start the client when the backbone environment
+    variable is not set results in an exception"""
+
+    with pytest.raises(SmartSimError) as ex:
+        ProtoClient(timing_on=False)
+
+    # confirm the missing value error has been raised
+    assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" ")))
+
+
+def test_protoclient_initialization(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that attempting to start the client with required env vars results
+    in a fully initialized client
+
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone"""
+
+    with monkeypatch.context() as ctx:
+        ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
+        # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture
+
+        client = ProtoClient(timing_on=False)
+
+    # confirm the backbone was attached correctly
+    assert client._backbone is not None
+    assert client._backbone.descriptor == the_backbone.descriptor
+
+    # confirm the worker queue is created and attached correctly
+    assert client._to_worker_fli is not None
+    assert client._to_worker_fli.descriptor == the_worker_queue.descriptor
+
+    # confirm the worker channels are created
+    assert client._from_worker_ch is not None
+    assert client._from_worker_ch.descriptor
+
+    assert client._to_worker_ch is not None
+    assert client._to_worker_ch.descriptor
+
+    # confirm a publisher is created
+    assert client._publisher is not None
+
+
+def test_protoclient_write_model(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that writing a model using the client causes the model data to be
+    written to a feature store and triggers a key-written event
+
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone"""
+
+    with monkeypatch.context() as ctx:
+        ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
+        # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture
+
+        client = ProtoClient(timing_on=False)
+
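+        # any key/bytes pair works here: set_model stores the raw payload, and
+        # the test only asserts the write and the resulting event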
model_key = "my-model" + model_bytes = b"12345" + + client.set_model(model_key, model_bytes) + + # confirm the client modified the underlying feature store + assert client._backbone[model_key] == model_bytes + + publisher = t.cast(EventBroadcaster, client._publisher) + + # confirm the client raised the key-written event + assert len(publisher._event_buffer) == 1 + + event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop())) + assert event.descriptor == the_backbone.descriptor + assert event.key == model_key diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index ccdbce58c..714492f37 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -53,6 +53,7 @@ import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process +import torch.nn as nn from dragon import fli from dragon.channels import Channel from dragon.data.ddict.ddict import DDict @@ -86,6 +87,35 @@ pytestmark = pytest.mark.dragon +class MiniModel(nn.Module): + def __init__(self): + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + return self._net(input) + + @property + def bytes(self) -> bytes: + """Returns the model serialized to a byte stream""" + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + return torch.randn((100, 2), dtype=torch.float32) + + +def load_model() -> bytes: + """Create a simple torch model in memory for testing""" + mini_model = MiniModel() + return mini_model.bytes + + def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. 
@@ -106,29 +136,17 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( request_dispatcher_queue: DragonFLIChannel, feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, ) -> None: """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes + model_key = "mini-model" for iteration_number in range(2): channel = Channel.make_process_local() callback_channel = DragonCommChannel(channel) + output_key = f"output-{iteration_number}" - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) + feature_store[model_key] = load_model() tensor = ( (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) @@ -139,12 +157,13 @@ def mock_messages( "c", "float32", list(tensor.shape) ) - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) + message_tensor_output_key = MessageHandler.build_feature_store_key( + output_key, fsd + ) + message_model_key = MessageHandler.build_feature_store_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=base64.b64encode(channel.serialize()).decode("utf-8"), + reply_channel=callback_channel.descriptor, model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], @@ -190,25 +209,20 @@ def service_as_dragon_proc( ) -def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: +def test_request_dispatcher() -> None: """Test the request dispatcher batching and queueing system This also includes setting a queue to disposable, checking that it is no longer referenced by the dispatcher. 
""" - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - to_worker_channel = dch.Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_ch.descriptor ddict = DDict(1, 2, 4 * 1024**2) dragon_fs = DragonFeatureStore(ddict) @@ -216,15 +230,14 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) - integrated_worker_type = TorchWorker request_dispatcher = RequestDispatcher( batch_timeout=0, batch_size=2, config_loader=config_loader, - worker_type=integrated_worker_type, + worker_type=TorchWorker, mem_pool_size=2 * 1024**2, ) @@ -241,9 +254,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] - fs_path = test_path / f"feature_store" - comm_path = test_path / f"comm_store" - model_key = str(fs_path / "model_fs.pt") + model_key = "mini-model" # create a mock client application to populate the request queue msg_pump = mp.Process( @@ -251,8 +262,6 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: args=( worker_queue, dragon_fs, - fs_path, - comm_path, ), ) @@ -260,7 +269,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: time.sleep(1) - for attempts in range(15): + for _ in range(15): try: request_dispatcher._on_iteration() batch = request_dispatcher.task_queue.get(timeout=1) diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 1ebc512a5..43b8cc7ec 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -1,218 +1,339 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import io -import logging -import pathlib -import time - -import pytest - -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") - -import base64 -import multiprocessing as mp - -try: - mp.set_start_method("dragon") -except Exception: - pass - -import os - -import dragon.channels as dch -from dragon import fli -from dragon.mpbridge.queues import DragonQueue - -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.worker_manager import ( - EnvironmentConfigLoader, - WorkerManager, -) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler -from smartsim.log import get_logger - -from .feature_store import FileSystemFeatureStore -from .utils.channel import FileSystemCommChannel - -logger = get_logger(__name__) -# The tests in this file belong to the dragon group -pytestmark = pytest.mark.dragon - - -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. - - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path - - -def mock_messages( - worker_manager_queue: CommChannelBase, - feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, -) -> None: - """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes - - iteration_number = 0 - - while True: - iteration_number += 1 - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded - - channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" - callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) - - buffer = io.BytesIO() - tensor = torch.randn((1, 2), dtype=torch.float32) - torch.save(tensor, buffer) - feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, - model=message_model_key, - inputs=[message_tensor_input_key], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.send(request_bytes) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def test_worker_manager(prepare_environment: pathlib.Path) -> None: - """Test the worker manager""" - - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader - # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - - config_loader = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - integrated_worker_type = TorchWorker - - worker_manager = WorkerManager( - config_loader, - integrated_worker_type, - as_service=True, - cooldown=5, - device="cpu", - dispatcher_queue=mp.Queue(maxsize=0), - ) - - worker_queue = config_loader.get_queue() - if worker_queue is None: - logger.warn( - f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" - ) - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - FileSystemFeatureStore(fs_path), - fs_path, - comm_path, - ), - ) - msg_pump.start() - - # create a process to execute commands - process = mp.Process(target=worker_manager.execute) - process.start() - process.join(timeout=5) - process.kill() - msg_pump.kill() +# 
# BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# import io +# import logging +# import pathlib +# import time + +# import pytest + +# torch = pytest.importorskip("torch") +# dragon = pytest.importorskip("dragon") + +# import multiprocessing as mp + +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# BackboneFeatureStore, +# ) +# from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor + +# try: +# mp.set_start_method("dragon") +# except Exception: +# pass + +# import os + +# import dragon.channels as dch +# import torch.nn as nn +# from dragon import fli +# from dragon.data.ddict.ddict import DDict + +# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +# from smartsim._core.mli.infrastructure.control.worker_manager import ( +# EnvironmentConfigLoader, +# WorkerManager, +# ) +# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +# DragonFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +# from smartsim._core.mli.message_handler import MessageHandler +# from smartsim.log import get_logger + +# from .utils.channel import FileSystemCommChannel + +# logger = get_logger(__name__) +# # The tests in this file belong to the dragon group +# pytestmark = pytest.mark.dragon + + +# class MiniModel(nn.Module): +# def __init__(self): +# super().__init__() + +# self._name = "mini-model" +# self._net = torch.nn.Linear(2, 1) + +# def forward(self, input): +# return self._net(input) + +# @property +# def bytes(self) -> bytes: +# """Returns the model serialized to a byte stream""" +# buffer = io.BytesIO() +# scripted = torch.jit.trace(self._net, self.get_batch()) +# torch.jit.save(scripted, buffer) +# return buffer.getvalue() + +# @classmethod +# def get_batch(cls) -> "torch.Tensor": +# return torch.randn((100, 2), dtype=torch.float32) + + +# def create_model(model_path: pathlib.Path) -> pathlib.Path: +# """Create a simple torch model and persist to disk for +# testing purposes. 
+ +# TODO: remove once unit tests are in place""" +# if not model_path.parent.exists(): +# model_path.parent.mkdir(parents=True, exist_ok=True) + +# model_path.unlink(missing_ok=True) + +# mini_model = MiniModel() +# torch.save(mini_model, model_path) + +# return model_path + + +# def load_model() -> bytes: +# """Create a simple torch model in memory for testing""" +# mini_model = MiniModel() +# return mini_model.bytes + + +# def mock_messages( +# feature_store_root_dir: pathlib.Path, +# comm_channel_root_dir: pathlib.Path, +# kill_queue: mp.Queue, +# ) -> None: +# """Mock event producer for triggering the inference pipeline""" +# feature_store_root_dir.mkdir(parents=True, exist_ok=True) +# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + +# iteration_number = 0 + +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, +# ) +# backbone = config_loader.get_backbone() + +# worker_queue = config_loader.get_queue() +# if worker_queue is None: +# queue_desc = config_loader._queue_descriptor +# logger.warn( +# f"FLI input queue not loaded correctly from config_loader: {queue_desc}" +# ) + +# model_key = "mini-model" +# model_bytes = load_model() +# backbone[model_key] = model_bytes + +# message_model_key = MessageHandler.build_feature_store_key( +# model_key, backbone.descriptor +# ) + +# while True: +# if not kill_queue.empty(): +# return +# iteration_number += 1 +# time.sleep(1) +# # 1. for demo, ignore upstream and just put stuff into downstream +# # 2. for demo, only one downstream but we'd normally have to filter +# # msg content and send to the correct downstream (worker) queue +# # timestamp = time.time_ns() +# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" +# # mock_channel.touch() + +# # thread - just look for key (wait for keys) +# # call checkpoint, try to get non-persistent key, it blocks +# # working set size > 1 has side-effects +# # only incurs cost when working set size has been exceeded + +# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" +# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + +# # input_key = f"my-input-{iteration_number}" +# output_key = f"my-output-{iteration_number}" + +# batch = MiniModel.get_batch() +# shape = batch.shape +# batch_bytes = batch.numpy().tobytes() +# # backbone[input_key] = batch_bytes + +# logger.debug(f"Model content: {backbone[model_key][:20]}") +# # logger.debug(f"Input content: {backbone[input_key][:20]}") + +# fsd = backbone.descriptor + +# # message_tensor_output_key = MessageHandler.build_feature_store_key( +# # output_key, fsd +# # ) +# # message_tensor_input_key = MessageHandler.build_feature_store_key( +# # input_key, fsd +# # ) + +# input_descriptor = MessageHandler.build_tensor_descriptor( +# "f", "float32", list(shape) +# ) + +# # output_descriptor = MessageHandler.build_output_tensor_descriptor( +# # "f", [], "float32", list(shape) +# # ) + +# # The first request is always the metadata... 
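+# # A hedged sketch of the consuming side of this two-part exchange
+# # (names such as `incoming_fli` are hypothetical; recvh/recv_bytes are
+# # assumed symmetric with the sendh/send_bytes calls used below, and this
+# # is illustrative only, not the worker manager's actual code):
+# #
+# #   with incoming_fli.recvh(timeout=None) as recvh:
+# #       request_bytes = recvh.recv_bytes(timeout=None)
+# #       request = MessageHandler.deserialize_request(request_bytes)
+# #       batch_bytes = recvh.recv_bytes(timeout=None)  # raw tensor payload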
+# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# # model=message_model_key, +# model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"), +# # inputs=[message_tensor_input_key], +# inputs=[input_descriptor], +# # outputs=[message_tensor_output_key], +# outputs=[], +# # output_descriptors=[output_descriptor], +# output_descriptors=[], +# custom_attributes=None, +# ) +# request_bytes = MessageHandler.serialize_request(request) +# fli: DragonFLIChannel = worker_queue + +# with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh: +# sendh.send_bytes(request_bytes) +# sendh.send_bytes(batch_bytes) + +# # worker_queue.send(request_bytes) +# # follow up with the actual data +# # worker_queue.send(batch_bytes) + +# logger.info("published message") + +# if iteration_number > 5: +# return + + +# def mock_mli_infrastructure_mgr(): +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, +# ) + +# integrated_worker = TorchWorker + +# worker_manager = WorkerManager( +# config_loader, +# integrated_worker, +# as_service=True, +# cooldown=10, +# device="cpu", +# dispatcher_queue=mp.Queue(maxsize=0), +# ) +# worker_manager.execute() + + +# @pytest.fixture +# def prepare_environment(test_dir: str) -> pathlib.Path: +# """Cleanup prior outputs to run demo repeatedly""" +# path = pathlib.Path(f"{test_dir}/workermanager.log") +# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) +# return path + + +# def test_worker_manager(prepare_environment: pathlib.Path) -> None: +# """Test the worker manager""" + +# test_path = prepare_environment +# fs_path = test_path / "feature_store" +# comm_path = test_path / "comm_store" + +# # old instantiation code start +# # to_worker_channel = dch.Channel.make_process_local() +# # to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) +# # to_worker_fli_serialized = to_worker_fli.serialize() + +# # # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader +# # # or test environment may be unable to send messages w/queue +# # descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") +# # os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + +# mgr_per_node = 1 +# num_nodes = 2 +# mem_per_node = 1024**3 +# total_mem = num_nodes * mem_per_node + +# storage = DDict( +# managers_per_node=mgr_per_node, +# n_nodes=num_nodes, +# total_mem=total_mem, +# ) +# backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) + +# to_worker_channel = dch.Channel.make_process_local() +# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + +# to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) + +# # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader +# # or test environment may be unable to send messages w/queue +# os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor +# os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor + +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, +# ) +# integrated_worker_type = TorchWorker + +# worker_manager = WorkerManager( +# config_loader, +# integrated_worker_type, 
+# as_service=True, +# cooldown=5, +# device="cpu", +# dispatcher_queue=mp.Queue(maxsize=0), +# ) + +# worker_queue = config_loader.get_queue() +# if worker_queue is None: +# logger.warn( +# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" +# ) +# backbone.worker_queue = to_worker_fli_comm_channel.descriptor + +# # create a mock client application to populate the request queue +# kill_queue = mp.Queue() +# msg_pump = mp.Process( +# target=mock_messages, +# args=(fs_path, comm_path, kill_queue), +# ) +# msg_pump.start() + +# # create a process to execute commands +# process = mp.Process(target=mock_mli_infrastructure_mgr) + +# # let it send some messages before starting the worker manager +# msg_pump.join(timeout=5) +# process.start() +# msg_pump.join(timeout=5) +# kill_queue.put_nowait("kill!") +# process.join(timeout=5) +# msg_pump.kill() +# process.kill() diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 6cde6258f..09e1703bc 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -39,17 +39,14 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if not isinstance(key, bytes): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -110,17 +107,14 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor!r}") diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 234878423..b00ba9aa2 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -39,17 +39,14 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if isinstance(key, pathlib.Path): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -110,17 +107,14 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The 
descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor}") diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py index 60f1f0c6b..67a9a4a9b 100644 --- a/tests/mli/test_integrated_torch_worker.py +++ b/tests/mli/test_integrated_torch_worker.py @@ -106,9 +106,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# message_model_key = MessageHandler.build_feature_store_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -146,9 +146,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# # message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# # message_model_key = MessageHandler.build_feature_store_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -187,9 +187,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# # message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# # message_model_key = MessageHandler.build_feature_store_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) @@ -231,9 +231,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# message_model_key = MessageHandler.build_model_key(model_key) +# # message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# message_model_key = MessageHandler.build_feature_store_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) diff --git a/tests/test_featurestore.py b/tests/test_featurestore.py new file mode 100644 index 000000000..f0b122bcf --- /dev/null +++ b/tests/test_featurestore.py @@ -0,0 +1,711 @@ +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. 
+# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# import pathlib +# import time +# import typing as t +# import unittest.mock as mock + +# import pytest + +# dragon = pytest.importorskip("dragon") + +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# BackboneFeatureStore, +# EventBroadcaster, +# EventCategory, +# EventConsumer, +# OnCreateConsumer, +# OnWriteFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# time as bbtime, +# ) +# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +# DragonFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +# from smartsim.error import SmartSimError +# from tests.mli.channel import FileSystemCommChannel +# from tests.mli.feature_store import MemoryFeatureStore + +# if t.TYPE_CHECKING: +# import conftest + + +# # The tests in this file belong to the group_a group +# pytestmark = pytest.mark.group_a + +# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" +# RANDOMLY_SET_KEY = "_SOMETHING_ELSE" + + +# @pytest.fixture +# def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: +# storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"} +# return storage + + +# def boom(*args, **kwargs) -> None: +# """Helper function that blows up when used to mock up +# some other function""" +# raise Exception(f"you shall not pass! 
{args}, {kwargs}") + + +# def test_event_uid() -> None: +# """Verify that all events include a unique identifier""" +# uids: t.Set[str] = set() +# num_iters = 1000 + +# # generate a bunch of events and keep track all the IDs +# for i in range(num_iters): +# event_a = OnCreateConsumer(str(i), []) +# event_b = OnWriteFeatureStore(str(i), "key") + +# uids.add(event_a.uid) +# uids.add(event_b.uid) + +# # verify each event created a unique ID +# assert len(uids) == 2 * num_iters + + +# def test_mli_reserved_keys_conversion() -> None: +# """Verify that conversion from a string to an enum member +# works as expected""" + +# for reserved_key in ReservedKeys: +# # iterate through all keys and verify `from_string` works +# assert ReservedKeys.contains(reserved_key.value) + +# # show that the value (actual key) not the enum member name +# # will not be incorrectly identified as reserved +# assert not ReservedKeys.contains(str(reserved_key).split(".")[1]) + + +# def test_mli_reserved_keys_writes() -> None: +# """Verify that attempts to write to reserved keys are blocked from a +# standard DragonFeatureStore but enabled with the BackboneFeatureStore""" + +# mock_storage = {} +# dfs = DragonFeatureStore(mock_storage) +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# other = MemoryFeatureStore(mock_storage) + +# expected_value = "value" + +# for reserved_key in ReservedKeys: +# # we expect every reserved key to fail using DragonFeatureStore... +# with pytest.raises(SmartSimError) as ex: +# dfs[reserved_key] = expected_value + +# assert "reserved key" in ex.value.args[0] + +# # ... and expect other feature stores to respect reserved keys +# with pytest.raises(SmartSimError) as ex: +# other[reserved_key] = expected_value + +# assert "reserved key" in ex.value.args[0] + +# # ...and those same keys to succeed on the backbone +# backbone[reserved_key] = expected_value +# actual_value = backbone[reserved_key] +# assert actual_value == expected_value + + +# def test_mli_consumers_read_by_key() -> None: +# """Verify that the value returned from the mli consumers +# method is written to the correct key and reads are +# allowed via standard dragon feature store. 
+# NOTE: should reserved reads also be blocked""" + +# mock_storage = {} +# dfs = DragonFeatureStore(mock_storage) +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# other = MemoryFeatureStore(mock_storage) + +# expected_value = "value" + +# # write using backbone that has permission to write reserved keys +# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value + +# # confirm read-only access to reserved keys from any FeatureStore +# for fs in [dfs, backbone, other]: +# assert fs[ReservedKeys.MLI_NOTIFY_CONSUMERS] == expected_value + + +# def test_mli_consumers_read_by_backbone() -> None: +# """Verify that the backbone reads the correct location +# when using the backbone feature store API instead of mapping API""" + +# mock_storage = {} +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# expected_value = "value" + +# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value + +# # confirm reading via convenience method returns expected value +# assert backbone.notification_channels[0] == expected_value + + +# def test_mli_consumers_write_by_backbone() -> None: +# """Verify that the backbone writes the correct location +# when using the backbone feature store API instead of mapping API""" + +# mock_storage = {} +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# expected_value = ["value"] + +# backbone.notification_channels = expected_value + +# # confirm write using convenience method targets expected key +# assert backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] == ",".join(expected_value) + + +# def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: +# """Verify that a broadcast operation without any registered subscribers +# succeeds without raising Exceptions + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} +# consumer_descriptor = storage_path / "test-consumer" + +# # NOTE: we're not putting any consumers into the backbone here! 
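+# # The contract exercised below, restated as a compact sketch (drawn
+# # from this test's own assertions rather than separate API docs):
+# #
+# #   publisher = EventBroadcaster(backbone)  # no consumers registered
+# #   assert publisher.send(event) == 0       # nothing is delivered...
+# #   assert publisher.num_buffered == 1      # ...the event is buffered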
+# backbone = BackboneFeatureStore(mock_storage) + +# event = OnCreateConsumer(consumer_descriptor, []) + +# publisher = EventBroadcaster(backbone) +# num_receivers = 0 + +# # publishing this event without any known consumers registered should succeed +# # but report that it didn't have anybody to send the event to +# consumer_descriptor = storage_path / f"test-consumer" +# event = OnCreateConsumer(consumer_descriptor, []) + +# num_receivers += publisher.send(event) + +# # confirm no changes to the backbone occur when fetching the empty consumer key +# key_in_features_store = ReservedKeys.MLI_NOTIFY_CONSUMERS in backbone +# assert not key_in_features_store + +# # confirm that the broadcast reports no events published +# assert num_receivers == 0 +# # confirm that the broadcast buffered the event for a later send +# assert publisher.num_buffered == 1 + + +# def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: +# """Verify that a broadcast operation without any registered subscribers +# succeeds without raising Exceptions + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} + +# # note: file-system descriptors are just paths +# consumer_descriptor = storage_path / "test-consumer" + +# # prep our backbone with a consumer list +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# backbone.notification_channels = [] + +# event = OnCreateConsumer(consumer_descriptor, []) +# publisher = EventBroadcaster( +# backbone, channel_factory=FileSystemCommChannel.from_descriptor +# ) +# num_receivers = publisher.send(event) + +# registered_consumers = backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] + +# # confirm that no consumers exist in backbone to send to +# assert not registered_consumers +# # confirm that the broadcast reports no events published +# assert num_receivers == 0 +# # confirm that the broadcast buffered the event for a later send +# assert publisher.num_buffered == 1 + + +# def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: +# """Verify that a broadcast operation reports an error if no channel +# factory was supplied for constructing the consumer channels + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} + +# # note: file-system descriptors are just paths +# consumer_descriptor = storage_path / "test-consumer" + +# # prep our backbone with a consumer list +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# backbone.notification_channels = [consumer_descriptor] + +# event = OnCreateConsumer(consumer_descriptor, []) +# publisher = EventBroadcaster( +# backbone, +# # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied +# ) + +# with pytest.raises(SmartSimError) as ex: +# publisher.send(event) + +# assert "factory" in ex.value.args[0] + + +# def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: +# """Verify that a successful broadcast clears messages from the event +# buffer when a new message is sent and consumers are registered + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} + +# # note: file-system 
descriptors are just paths
+# consumer_descriptor = storage_path / "test-consumer"
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# backbone.notification_channels = (consumer_descriptor,)
+
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# # mock building up some buffered events
+# num_buffered_events = 14
+# for i in range(num_buffered_events):
+# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", [])
+# publisher._event_buffer.append(bytes(event))
+
+# event0 = OnCreateConsumer(
+# storage_path / f"test-consumer-{str(num_buffered_events + 1)}", []
+# )
+
+# num_receivers = publisher.send(event0)
+# # 1 receiver x 15 total events == 15 events
+# assert num_receivers == num_buffered_events + 1
+
+
+# @pytest.mark.parametrize(
+# "num_consumers, num_buffered, expected_num_sent",
+# [
+# pytest.param(0, 7, 0, id="0 x (7+1) - no consumers, multi-buffer"),
+# pytest.param(1, 7, 8, id="1 x (7+1) - single consumer, multi-buffer"),
+# pytest.param(2, 7, 16, id="2 x (7+1) - multi-consumer, multi-buffer"),
+# pytest.param(4, 4, 20, id="4 x (4+1) - multi-consumer, multi-buffer (odd #)"),
+# pytest.param(9, 0, 9, id="9 x (0+1) - multi-consumer, empty buffer"),
+# ],
+# )
+# def test_eventpublisher_broadcast_returns_total_sent(
+# test_dir: str, num_consumers: int, num_buffered: int, expected_num_sent: int
+# ) -> None:
+# """Verify that a successful broadcast returns the total number of events
+# sent, including buffered messages.
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param num_consumers: the number of consumers to mock setting up prior to send
+# :param num_buffered: the number of pre-buffered events to mock up
+# :param expected_num_sent: the expected result from calling send
+# """
+# storage_path = pathlib.Path(test_dir) / "features"
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# consumers = []
+# for i in range(num_consumers):
+# consumers.append(storage_path / f"test-consumer-{i}")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# backbone.notification_channels = consumers
+
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# # mock building up some buffered events
+# for i in range(num_buffered):
+# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", [])
+# publisher._event_buffer.append(bytes(event))
+
+# assert publisher.num_buffered == num_buffered
+
+# # this event will trigger clearing anything already in buffer
+# event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", [])
+
+# # num_receivers should contain a number that computes w/all consumers and all events
+# num_receivers = publisher.send(event0)
+
+# assert num_receivers == expected_num_sent
+
+
+# def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None:
+# """Verify that any unused consumers are pruned each time a new event is sent
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# consumer_descriptor = storage_path / "test-consumer"
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+
+# publisher = EventBroadcaster(
+# backbone, 
channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# event = OnCreateConsumer(consumer_descriptor, [])
+
+# # the only registered consumer is in the event, expect no pruning
+# backbone.notification_channels = (consumer_descriptor,)
+
+# publisher.send(event)
+# assert str(consumer_descriptor) in publisher._channel_cache
+# assert len(publisher._channel_cache) == 1
+
+# # add a new descriptor for another event...
+# consumer_descriptor2 = storage_path / "test-consumer-2"
+# # ... and remove the old descriptor from the backbone when it's looked up
+# backbone.notification_channels = (consumer_descriptor2,)
+
+# event = OnCreateConsumer(consumer_descriptor2, [])
+
+# publisher.send(event)
+
+# assert str(consumer_descriptor2) in publisher._channel_cache
+# assert str(consumer_descriptor) not in publisher._channel_cache
+# assert len(publisher._channel_cache) == 1
+
+# # test multi-consumer pruning by caching some extra channels
+# prune0, prune1, prune2 = "abc", "def", "ghi"
+# publisher._channel_cache[prune0] = "doesnt-matter-if-it-is-pruned"
+# publisher._channel_cache[prune1] = "doesnt-matter-if-it-is-pruned"
+# publisher._channel_cache[prune2] = "doesnt-matter-if-it-is-pruned"
+
+# # add in one of our old channels so we prune the above items, send to these
+# backbone.notification_channels = (consumer_descriptor, consumer_descriptor2)
+
+# publisher.send(event)
+
+# assert str(consumer_descriptor2) in publisher._channel_cache
+
+# # NOTE: we should NOT prune something that isn't used by this message but
+# # does appear in `backbone.notification_channels`
+# assert str(consumer_descriptor) in publisher._channel_cache
+
+# # confirm all of our items that were not in the notification channels are gone
+# for pruned in [prune0, prune1, prune2]:
+# assert pruned not in publisher._channel_cache
+
+# # confirm we have only the two expected items in the channel cache
+# assert len(publisher._channel_cache) == 2
+
+
+# def test_eventpublisher_serialize_failure(
+# test_dir: str, monkeypatch: pytest.MonkeyPatch
+# ) -> None:
+# """Verify that errors during message serialization are raised to the caller
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param monkeypatch: pytest fixture for modifying behavior of existing code
+# with mock implementations"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# with monkeypatch.context() as patch:
+# event = OnCreateConsumer(target_descriptor, [])
+
+# # patch the __bytes__ implementation to cause pickling to fail during send
+# patch.setattr(event, "__bytes__", lambda x: b"abc")
+
+# backbone.notification_channels = (target_descriptor,)
+
+# # send a message into the channel
+# with pytest.raises(ValueError) as ex:
+# publisher.send(event)
+
+# assert "serialize" in ex.value.args[0]
+
+
+# def test_eventpublisher_factory_failure(
+# test_dir: str, monkeypatch: pytest.MonkeyPatch
+# ) -> None:
+# """Verify that errors during channel construction are raised to the caller
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param
monkeypatch: pytest fixture for modifying behavior of existing code
+# with mock implementations"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# def boom(descriptor: str) -> None:
+# raise Exception(f"you shall not pass! {descriptor}")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# publisher = EventBroadcaster(backbone, channel_factory=boom)
+
+# with monkeypatch.context() as patch:
+# event = OnCreateConsumer(target_descriptor, [])
+
+# backbone.notification_channels = (target_descriptor,)
+
+# # send a message into the channel
+# with pytest.raises(SmartSimError) as ex:
+# publisher.send(event)
+
+# assert "construct" in ex.value.args[0]
+
+
+# def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None:
+# """Verify that unexpected errors during message send are caught and wrapped in a
+# SmartSimError so they are not propagated directly to the caller
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param monkeypatch: pytest fixture for modifying behavior of existing code
+# with mock implementations"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# def boom(self) -> None:
+# raise Exception("That was unexpected...")
+
+# with monkeypatch.context() as patch:
+# event = OnCreateConsumer(target_descriptor, [])
+
+# # patch the _broadcast implementation to cause send to fail
+# # after the event has been pickled
+# patch.setattr(publisher, "_broadcast", boom)
+
+# backbone.notification_channels = (target_descriptor,)
+
+# # Here, the unexpected exception raised by broadcast is not allowed
+# # out directly; instead, it is wrapped in a SmartSimError
+# with pytest.raises(SmartSimError) as ex:
+# publisher.send(event)
+
+# assert "unexpected" in ex.value.args[0]
+
+
+# def test_eventconsumer_receive(test_dir: str) -> None:
+# """Verify that a consumer retrieves a message from the given channel
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+# event = OnCreateConsumer(target_descriptor, [])
+
+# # simulate a sent event by writing directly to the input comm channel
+# comm_channel.send(bytes(event))
+
+# consumer = EventConsumer(comm_channel, backbone)
+
+# all_received: t.List[OnCreateConsumer] = consumer.receive()
+# assert len(all_received) == 1
+
+# # verify we received the same event that was raised
+# assert all_received[0].category == event.category
+# assert all_received[0].descriptor == 
event.descriptor
+
+
+# @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16])
+# def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
+# """Verify that a consumer retrieves multiple messages from the given channel
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param num_sent: parameterized value used to vary the number of events
+# that are enqueued, so validations are checked at multiple queue sizes"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+
+# # simulate multiple sent events by writing directly to the input comm channel
+# for _ in range(num_sent):
+# event = OnCreateConsumer(target_descriptor, [])
+# comm_channel.send(bytes(event))
+
+# consumer = EventConsumer(comm_channel, backbone)
+
+# all_received: t.List[OnCreateConsumer] = consumer.receive()
+# assert len(all_received) == num_sent
+
+
+# def test_eventconsumer_receive_empty(test_dir: str) -> None:
+# """Verify that a consumer receiving an empty message ignores the
+# message and continues processing
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+
+# # simulate a sent event by writing directly to the input comm channel
+# comm_channel.send(bytes(b""))
+
+# consumer = EventConsumer(comm_channel, backbone)
+
+# messages = consumer.receive()
+
+# # the messages array should be empty
+# assert not messages
+
+
+# def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
+# """Verify that the publisher and consumer integrate as expected when
+# multiple publishers and consumers are sending simultaneously. 
+ +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# storage_path.mkdir(parents=True, exist_ok=True) + +# mock_storage = {} +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# mock_fs_descriptor = str(storage_path / f"mock-feature-store") + +# wmgr_channel = FileSystemCommChannel(storage_path / "test-wmgr") +# capp_channel = FileSystemCommChannel(storage_path / "test-capp") +# back_channel = FileSystemCommChannel(storage_path / "test-backend") + +# wmgr_consumer_descriptor = wmgr_channel.descriptor +# capp_consumer_descriptor = capp_channel.descriptor +# back_consumer_descriptor = back_channel.descriptor + +# # create some consumers to receive messages +# wmgr_consumer = EventConsumer( +# wmgr_channel, +# backbone, +# filters=[EventCategory.FEATURE_STORE_WRITTEN], +# ) +# capp_consumer = EventConsumer( +# capp_channel, +# backbone, +# ) +# back_consumer = EventConsumer( +# back_channel, +# backbone, +# filters=[EventCategory.CONSUMER_CREATED], +# ) + +# # create some broadcasters to publish messages +# mock_worker_mgr = EventBroadcaster( +# backbone, +# channel_factory=FileSystemCommChannel.from_descriptor, +# ) +# mock_client_app = EventBroadcaster( +# backbone, +# channel_factory=FileSystemCommChannel.from_descriptor, +# ) + +# # register all of the consumers even though the OnCreateConsumer really should +# # trigger its registration. event processing is tested elsewhere. +# backbone.notification_channels = [ +# wmgr_consumer_descriptor, +# capp_consumer_descriptor, +# back_consumer_descriptor, +# ] + +# # simulate worker manager sending a notification to backend that it's alive +# event_1 = OnCreateConsumer(wmgr_consumer_descriptor, []) +# mock_worker_mgr.send(event_1) + +# # simulate the app updating a model a few times +# event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") +# event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") +# event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + +# mock_client_app.send(event_2) +# mock_client_app.send(event_3) +# mock_client_app.send(event_4) + +# # worker manager should only get updates about feature update +# wmgr_messages = wmgr_consumer.receive() +# assert len(wmgr_messages) == 3 + +# # the backend should only receive messages about consumer creation +# back_messages = back_consumer.receive() +# assert len(back_messages) == 1 + +# # hypothetical app has no filters and will get all events +# app_messages = capp_consumer.receive() +# assert len(app_messages) == 4 diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index c09c787fc..092ae4fe0 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -34,14 +34,14 @@ handler = MessageHandler() -def test_build_model_key_successful(): +def test_build_feature_store_key_successful(): fsd = "mock-feature-store-descriptor" - model_key = handler.build_model_key("tensor_key", fsd) + model_key = handler.build_feature_store_key("tensor_key", fsd) assert model_key.key == "tensor_key" - assert model_key.featureStoreDescriptor == fsd + assert model_key.descriptor == fsd -def test_build_model_key_unsuccessful(): +def test_build_feature_store_key_unsuccessful(): with pytest.raises(ValueError): fsd = "mock-feature-store-descriptor" - model_key = handler.build_model_key(100, fsd) + model_key = 
handler.build_feature_store_key(100, fsd) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index beb9a4765..2b5575965 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -34,7 +34,7 @@ handler = MessageHandler() fsd = "mock-feature-store-descriptor" -tensor_key = handler.build_tensor_key("key", fsd) +tensor_key = handler.build_feature_store_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 7ede41b50..751722534 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -33,14 +33,14 @@ fsd = "mock-feature-store-descriptor" -model_key = MessageHandler.build_model_key("model_key", fsd) +model_key = MessageHandler.build_feature_store_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) -input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) +input_key1 = MessageHandler.build_feature_store_key("input_key1", fsd) +input_key2 = MessageHandler.build_feature_store_key("input_key2", fsd) -output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) -output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) +output_key1 = MessageHandler.build_feature_store_key("output_key1", fsd) +output_key2 = MessageHandler.build_feature_store_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] @@ -101,7 +101,7 @@ "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1, input_key2], [output_key1, output_key2], @@ -109,7 +109,7 @@ torch_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [input_key1], [output_key2], @@ -117,7 +117,7 @@ tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [input_key1], [output_key2], @@ -125,7 +125,7 @@ torch_attributes, ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1], @@ -185,7 +185,7 @@ def test_build_request_indirect_successful( id="bad channel", ), pytest.param( - b"reply channel", + "reply channel", "bad model", [input_key1], [output_key2], @@ -194,7 +194,7 @@ def test_build_request_indirect_successful( id="bad model", ), pytest.param( - b"reply channel", + "reply channel", model_key, ["input_key1", "input_key2"], [output_key1, output_key2], @@ -212,7 +212,7 @@ def test_build_request_indirect_successful( id="bad input schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], ["output_key1", "output_key2"], @@ -230,7 +230,7 @@ def test_build_request_indirect_successful( id="bad output schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -239,7 +239,7 @@ def test_build_request_indirect_successful( id="bad custom attributes", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -248,7 +248,7 @@ def test_build_request_indirect_successful( id="bad custom attributes schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ 
-276,7 +276,7 @@ def test_build_request_indirect_unsuccessful( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( - b"reply channel", + "reply channel", model_key, [tensor_1, tensor_2], [], @@ -284,7 +284,7 @@ def test_build_request_indirect_unsuccessful( torch_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_1], [], @@ -292,7 +292,7 @@ def test_build_request_indirect_unsuccessful( tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_2], [], @@ -300,7 +300,7 @@ def test_build_request_indirect_unsuccessful( tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_1], [], diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index 86774132e..d0305407c 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -33,8 +33,8 @@ fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) -result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) +result_key1 = MessageHandler.build_feature_store_key("result_key1", fsd) +result_key2 = MessageHandler.build_feature_store_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From 49e0da4bd5ffde73e7cd68a0e7519783de7b740d Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 17 Sep 2024 19:46:30 -0500 Subject: [PATCH 02/40] Revert featurestorekey test changes Revert tensor/model key, update tests, enhance logging Tweak exception naming to follow standard Test remote queue delay and nowait Remove large test timeout Update new tests w/fsk reversion Modify import order for remote tester --- .../standalone_worker_manager.py | 29 +- smartsim/_core/_cli/scripts/dragon_install.py | 9 +- .../_core/launcher/dragon/dragonConnector.py | 8 +- .../_core/mli/comm/channel/dragon_channel.py | 76 +- .../control/request_dispatcher.py | 23 +- .../mli/infrastructure/environment_loader.py | 4 +- .../storage/backbone_feature_store.py | 54 +- .../infrastructure/storage/feature_store.py | 2 +- smartsim/protoclient.py | 10 +- tests/dragon/channel.py | 18 +- .../test_core_machine_learning_worker.py | 54 +- tests/dragon/test_device_manager.py | 15 +- tests/dragon/test_error_handling.py | 39 +- tests/dragon/test_featurestore.py | 87 ++- tests/dragon/test_featurestore_base.py | 4 +- tests/dragon/test_featurestore_integration.py | 8 +- tests/dragon/test_inference_reply.py | 6 +- tests/dragon/test_inference_request.py | 6 +- tests/dragon/test_protoclient.py | 400 +++++----- tests/dragon/test_request_dispatcher.py | 101 ++- tests/dragon/test_torch_worker.py | 4 +- tests/dragon/test_worker_manager.py | 6 +- tests/dragon/utils/channel.py | 2 +- tests/mli/test_integrated_torch_worker.py | 24 +- tests/test_dragon_installer.py | 18 +- tests/test_dragon_launcher.py | 19 + tests/test_featurestore.py | 711 ------------------ .../test_build_model_key.py | 8 +- .../test_output_descriptor.py | 2 +- tests/test_message_handler/test_request.py | 10 +- tests/test_message_handler/test_response.py | 4 +- 31 files changed, 603 insertions(+), 1158 deletions(-) delete mode 100644 tests/test_featurestore.py diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index e34df0ccd..1d0b11055 100644 
--- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -146,16 +146,17 @@ def service_as_dragon_proc( to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, True) + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, True) - backbone.worker_queue = to_worker_fli_comm_channel.descriptor + backbone.worker_queue = to_worker_fli_comm_ch.descriptor + + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor arg_worker_type = cloudpickle.loads( base64.b64decode(args.worker_class.encode("ascii")) ) - os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor - config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, @@ -173,7 +174,7 @@ def service_as_dragon_proc( worker_device = args.device for wm_idx in range(args.num_workers): - worker_manager = WorkerManager( + worker_manager = WorkerManager( config_loader=config_loader, worker_type=arg_worker_type, as_service=True, @@ -191,21 +192,25 @@ def service_as_dragon_proc( # the GPU-to-CPU mapping is taken from the nvidia-smi tool # TODO can this be computed on the fly? gpu_to_cpu_aff: dict[int, list[int]] = {} - gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) - gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) - gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) - gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + gpu_to_cpu_aff[0] = list(range(48, 64)) + list(range(112, 128)) + gpu_to_cpu_aff[1] = list(range(32, 48)) + list(range(96, 112)) + gpu_to_cpu_aff[2] = list(range(16, 32)) + list(range(80, 96)) + gpu_to_cpu_aff[3] = list(range(0, 16)) + list(range(64, 80)) worker_manager_procs = [] for worker_idx in range(args.num_workers): wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) - worker_manager_procs.append(service_as_dragon_proc( + worker_manager_procs.append( + service_as_dragon_proc( worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] - )) + ) + ) - dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + dispatcher_proc = service_as_dragon_proc( + dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[] + ) # TODO: use ProcessGroup and restart=True? 
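# A possible shape for the TODO above, sketched with Dragon's native process-group
# API. This is an illustrative assumption (ProcessGroup/ProcessTemplate as exposed
# by recent Dragon releases), not part of this patch; signatures should be verified
# against the Dragon version this change pins.

import typing as t

from dragon.native.process import ProcessTemplate
from dragon.native.process_group import ProcessGroup


def start_as_group(
    templates: t.List[ProcessTemplate], restart: bool = True
) -> ProcessGroup:
    """Supervise the dispatcher and worker managers under a single group so
    crashed members can be re-launched rather than tracked one-by-one."""
    group = ProcessGroup(restart=restart)
    for template in templates:
        # each template carries its own target, args, and placement policy
        group.add_process(nproc=1, template=template)
    group.init()
    group.start()
    return group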
all_procs = [dispatcher_proc, *worker_manager_procs] diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 4fd0be300..662820fed 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -95,13 +95,14 @@ def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]: def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None: """Create a .env file with required environment variables for the Dragon runtime""" dragon_root = str(dragon_root_dir) - dragon_inc_dir = str(dragon_root_dir / "include") - dragon_lib_dir = str(dragon_root_dir / "lib") - dragon_bin_dir = str(dragon_root_dir / "bin") + dragon_root_str = dragon_root + dragon_inc_dir = dragon_root + "/include" + dragon_lib_dir = dragon_root + "/lib" + dragon_bin_dir = dragon_root + "/bin" dragon_vars = { "DRAGON_BASE_DIR": dragon_root, - "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir + "DRAGON_ROOT_DIR": dragon_root_str, "DRAGON_INCLUDE_DIR": dragon_inc_dir, "DRAGON_LIB_DIR": dragon_lib_dir, "DRAGON_VERSION": dragon_version, diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 0cd68c24e..9cbc55674 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -245,9 +245,11 @@ def load_persisted_env(self) -> t.Dict[str, str]: with open(config.dragon_dotenv, encoding="utf-8") as dot_env: for kvp in dot_env.readlines(): - split = kvp.strip().split("=", maxsplit=1) - key, value = split[0], split[-1] - self._env_vars[key] = value + # skip any commented lines + if not kvp.startswith("#"): + split = kvp.strip().split("=", maxsplit=1) + key, value = split[0], split[-1] + self._env_vars[key] = value return self._env_vars diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index a22ebe952..4f8d3e552 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -40,15 +40,44 @@ logger = get_logger(__name__) -import dragon.channels as dch - DEFAULT_CHANNEL_BUFFER_SIZE = 500 """Maximum number of messages that can be buffered. DragonCommChannel will raise an exception if no clients consume messages before the buffer is filled.""" +LAST_OFFSET = 0 +"""The last offset used to create a local channel. This is used to avoid +unnecessary retries when creating a local channel.""" + + +def _channel_to_descriptor(channel: dch.Channel) -> str: + """Utility method for converting a channel to a descriptor string. + + :param channel: The dragon channel to convert + :returns: The descriptor string + """ + if channel is None: + raise SmartSimError("Channel is not available to create a descriptor") + + serialized_ch = channel.serialize() + return base64.b64encode(serialized_ch).decode("utf-8") + + +def _pool_to_descriptor(pool: dm.MemoryPool) -> str: + """Utility method for converting a pool to a descriptor string. + + :param pool: The memory pool to convert + :returns: The descriptor string""" + if pool is None: + raise SmartSimError("Memory pool is not available to create a descriptor") + + serialized_pool = pool.serialize() + return base64.b64encode(serialized_pool).decode("utf-8") + def create_local(capacity: int = 0) -> dch.Channel: - """Creates a Channel attached to the local memory pool. 
Replacement for + direct calls to `dch.Channel.make_process_local()` to enable + supplying a channel capacity. :param capacity: The number of events the channel can buffer; uses the default buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied @@ -56,9 +85,14 @@ def create_local(capacity: int = 0) -> dch.Channel: :raises SmartSimError: If unable to attach local channel """ pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) + pool_descriptor = _pool_to_descriptor(pool) channel: t.Optional[dch.Channel] = None offset = 0 + global LAST_OFFSET + if LAST_OFFSET: + offset = LAST_OFFSET + capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE while not channel: @@ -66,18 +100,18 @@ offset += 1 cid = df.BASE_USER_MANAGED_CUID + offset try: - channel = dch.Channel( - mem_pool=pool, - c_uid=cid, - capacity=capacity, - ) + channel = dch.Channel(mem_pool=pool, c_uid=cid, capacity=capacity) + LAST_OFFSET = offset + descriptor = _channel_to_descriptor(channel) logger.debug( - f"Channel {cid} created in pool {pool.serialize()} w/capacity {capacity}" + "Local channel created: " + f"{cid=}, {pool_descriptor=}, {capacity=}, {descriptor=}" ) - except Exception as e: + except dch.ChannelError as e: if offset < 100: - logger.warning(f"Unable to attach to channel id {cid}. Retrying...") + logger.warning(f"Channel id {cid} is not open. Retrying...") else: + LAST_OFFSET = 0 logger.error(f"All attempts to attach local channel have failed") raise SmartSimError("Failed to attach local channel") from e @@ -92,8 +126,7 @@ def __init__(self, channel: "dch.Channel") -> None: :param channel: A channel to use for communications """ - serialized_ch = channel.serialize() - descriptor = base64.b64encode(serialized_ch).decode("utf-8") + descriptor = _channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel @@ -115,7 +148,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: try: with self._channel.sendh(timeout=timeout) as sendh: sendh.send_bytes(value) - logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") + logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( f"Error sending message: DragonCommChannel {self.descriptor!r}" @@ -130,8 +163,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: with self._channel.recvh(timeout=timeout) as recvh: messages: t.List[bytes] = [] - # todo: consider that this could (under load) never exit. do we need - # to configure a maximum number to pull at once? try: message_bytes = recvh.recv_bytes(timeout=timeout) messages.append(message_bytes) @@ -139,7 +170,7 @@ except dch.ChannelEmpty: # emptied the queue, ok to swallow this ex logger.debug(f"DragonCommChannel exhausted: {self.descriptor}") - except dch.ChannelRecvTimeout as ex: + except dch.ChannelRecvTimeout: logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor}") return messages @@ -164,7 +195,7 @@ def descriptor_string(self) -> str: @classmethod def from_descriptor( cls, - descriptor: t.Union[bytes, str], + descriptor: str, ) -> "DragonCommChannel": """A factory method that creates an instance from a descriptor string. 
@@ -173,6 +204,9 @@ def from_descriptor( :returns: An attached DragonCommChannel :raises SmartSimError: If creation of comm channel fails""" try: + if isinstance(descriptor, bytes): + raise ValueError("Descriptor must be a string") + utf8_descriptor: t.Union[str, bytes] = descriptor if isinstance(descriptor, str): utf8_descriptor = descriptor.encode("utf-8") @@ -183,10 +217,10 @@ def from_descriptor( actual_descriptor = base64.b64decode(utf8_descriptor) channel = dch.Channel.attach(actual_descriptor) return DragonCommChannel(channel) - except Exception as ex: + except Exception as e: raise SmartSimError( - f"Failed to create dragon comm channel: {descriptor!r}" - ) from ex + f"Failed to create dragon comm channel: {descriptor}" + ) from e @classmethod def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel": diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 67797fe44..d14755f53 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -142,13 +142,22 @@ def ready(self) -> bool: :returns: True if the queue can be flushed, False otherwise """ if self.empty(): + logger.debug("Request dispatcher queue is empty") return False - timed_out = ( - self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout - ) - logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") - return self.full() or timed_out + timed_out = False + if self._batch_timeout > 0: + timed_out = self._elapsed_time >= self._batch_timeout + + if self.full(): + logger.debug("Request dispatcher ready to deliver full batch") + return True + + if timed_out: + logger.debug("Request dispatcher delivering partial batch") + return True + + return False def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets @@ -281,7 +290,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_missing = fs_desired - fs_actual if not self.has_featurestore_factory: - logger.error("No feature store factory configured") + logger.error("No feature store factory is configured. Unable to dispatch.") return False # create the feature stores we need to service request @@ -463,7 +472,7 @@ def dispatch(self, request: InferenceRequest) -> None: ) self._active_queues[tmp_id] = tmp_queue self._queues[tmp_id] = [tmp_queue] - tmp_queue.put_nowait(request) + tmp_queue.put(request) tmp_queue.make_disposable() return diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index e67cc469a..2c89184d8 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -83,7 +83,9 @@ def get_backbone(self) -> t.Optional[FeatureStore]: return None if self._featurestore_factory is None: - logger.warning("No feature store factory is configured") + logger.warning( + "No feature store factory is configured. Backbone not created." 
+ ) return None self.backbone = self._featurestore_factory(descriptor) diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 0db41f77a..9cc8a6bf9 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -27,6 +27,7 @@ import base64 import enum import itertools +import os import pickle import time import typing as t @@ -67,9 +68,10 @@ class BackboneFeatureStore(DragonFeatureStore): MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" MLI_BACKEND_CONSUMER = "_SMARTIM_MLI_BACKEND_CONSUMER" - MLI_WORKER_QUEUE = "to_worker_fli" + MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE" _CREATED_ON = "creation" + _DEFAULT_WAIT_TIMEOUT = 30.0 def __init__( self, @@ -86,7 +88,7 @@ def __init__( self._enable_reserved_writes = allow_reserved_writes if self._CREATED_ON not in self: - self._record_creation_date() + self._record_creation_data() @property def wait_timeout(self) -> float: @@ -154,7 +156,7 @@ def creation_date(self) -> str: """Return the creation date for the backbone feature store""" return str(self[self._CREATED_ON]) - def _record_creation_date(self) -> None: + def _record_creation_data(self) -> None: """Write the creation timestamp to the feature store""" if self._CREATED_ON not in self: if not self._allow_reserved_writes: @@ -163,6 +165,9 @@ def _record_creation_date(self) -> None: ) self[self._CREATED_ON] = str(time.time()) + if os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) is None: + os.environ.update(self.get_env()) + @classmethod def from_writable_descriptor( cls, @@ -181,9 +186,8 @@ def from_writable_descriptor( f"Error creating dragon feature store: {descriptor}" ) from ex - @staticmethod def _check_wait_timeout( - start_time: float, timeout: float, indicators: t.Dict[str, bool] + self, start_time: float, timeout: float, indicators: t.Dict[str, bool] ) -> None: """Perform timeout verification @@ -193,11 +197,11 @@ def _check_wait_timeout( elapsed = time.time() - start_time if timeout and elapsed > timeout: raise SmartSimError( - f"Timeout retrieving all keys from backbone: {indicators}" + f"Backbone {self.descriptor=} timeout retrieving all keys: {indicators}" ) def wait_for( - self, keys: t.List[str], timeout: float = 0 + self, keys: t.List[str], timeout: float = _DEFAULT_WAIT_TIMEOUT ) -> t.Dict[str, t.Union[str, bytes, None]]: """Perform a blocking wait until all specified keys have been found in the backbone @@ -205,39 +209,39 @@ def wait_for( :param keys: The required collection of keys to retrieve :param timeout: The maximum wait time in seconds. Overrides class level setting """ + if timeout < 0: + timeout = self._DEFAULT_WAIT_TIMEOUT + logger.info(f"Using default wait_for timeout: {timeout}s") + + if not keys: + return {} - to_check = list(keys) - was_found = [False for _ in to_check] # add test ensuring dupes are handled.. 
- values: t.List[t.Union[str, bytes, None]] = [None for _ in to_check] + values: t.Dict[str, t.Union[str, bytes, None]] = {k: None for k in set(keys)} + is_found = {k: False for k in values.keys()} - backoff: t.List[float] = [0.1, 0.5, 1, 2, 4, 8] + backoff: t.List[float] = [0.1, 0.5, 1, 2, 4] backoff_iter = itertools.cycle(backoff) start_time = time.time() - while not all(was_found): + while not all(is_found.values()): delay = next(backoff_iter) - for index, key in enumerate(to_check): - if was_found[index]: - continue - + for key in [k for k, v in is_found.items() if not v]: try: - values[index] = self[key] - was_found[index] = True - except KeyError: + values[key] = self[key] + is_found[key] = True + except Exception: if delay == backoff[-1]: logger.debug(f"Re-attempting `{key}` retrieval in {delay}s") - if all(was_found): + if all(is_found.values()): + logger.debug(f"wait_for({keys}) retrieved all keys") continue - self._check_wait_timeout( - start_time, timeout, dict(zip(to_check, was_found)) - ) - + self._check_wait_timeout(start_time, timeout, is_found) time.sleep(delay) - return dict(zip(keys, values)) + return values def get_env(self) -> t.Dict[str, str]: """Returns a dictionary populated with environment variables necessary to diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index ac6cdaf31..8c85a352d 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -47,7 +47,7 @@ class ReservedKeys(str, enum.Enum): """Storage location for the channel used to send messages directly to the MLI backend""" - MLI_WORKER_QUEUE = "to_worker_fli" # todo: ensure this adheres to standard + MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" """Storage location for the channel used to send work requests to the available worker managers""" diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index bf195a756..b0e235f8c 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -68,6 +68,8 @@ class ProtoClient: + _DEFAULT_TIMEOUT = 30.0 + @staticmethod def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: """Use the supplied environment variables to attach @@ -92,7 +94,9 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: def _attach_to_worker_queue(self) -> DragonFLIChannel: """Wait until the backbone contains the worker queue configuration, then attach an FLI to the given worker queue""" - configuration = self._backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) + configuration = self._backbone.wait_for( + [BackboneFeatureStore.MLI_WORKER_QUEUE], self._timeout + ) # descriptor = configuration.get(BackboneFeatureStore.MLI_WORKER_QUEUE, None) # NOTE: without wait_for, this MUST be in the backbone.... 
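# An aside on the `wait_for` contract relied upon here: it reduces to a
# cycled-backoff poll. A minimal standalone sketch of that pattern follows
# (hypothetical `store` mapping and helper name; the real logic lives in
# `BackboneFeatureStore.wait_for`, updated earlier in this patch):

import itertools
import time
import typing as t


def wait_for_keys(
    store: t.Mapping[str, str], keys: t.List[str], timeout: float = 30.0
) -> t.Dict[str, str]:
    """Poll `store` with a cycled backoff until every key appears or time runs out."""
    values: t.Dict[str, str] = {}
    backoff = itertools.cycle([0.1, 0.5, 1, 2, 4])
    deadline = time.time() + timeout
    # duplicate keys collapse via `set`; an empty key list returns immediately
    while pending := [k for k in set(keys) if k not in values]:
        for key in pending:
            if key in store:
                values[key] = store[key]
        if len(values) == len(set(keys)):
            break  # everything was found; skip the final sleep
        if time.time() > deadline:
            raise TimeoutError(f"keys not found before timeout: {pending}")
        time.sleep(next(backoff))
    return values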
# descriptor = self._backbone.worker_queue @@ -130,14 +134,14 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: :param timing_on: Flag indicating if timing information should be written to file - :param wait_timeout: Maximum wait time allowed to attach to the + :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the worker queue :raises: SmartSimError if unable to attach to a backbone featurestore""" # comm = MPI.COMM_WORLD # rank = comm.Get_rank() rank: int = 0 - self._queue_timeout = wait_timeout + self._timeout = wait_timeout or self._DEFAULT_TIMEOUT connect_to_infrastructure() # ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py index 234878423..b00ba9aa2 100644 --- a/tests/dragon/channel.py +++ b/tests/dragon/channel.py @@ -39,17 +39,14 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if isinstance(key, pathlib.Path): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -110,17 +107,14 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor}") diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index ed9ac625c..e9c356b4e 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -34,7 +34,7 @@ import torch import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey, TensorKey from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, @@ -98,7 +98,7 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> N fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=fsd) + model_key = ModelKey(key=key, descriptor=fsd) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -116,7 +116,7 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - model_key = FeatureStoreKey(key=key, descriptor=fsd) + model_key = ModelKey(key=key, descriptor=fsd) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -141,7 +141,7 @@ def 
test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -159,7 +159,7 @@ def test_fetch_model_feature_store_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -182,7 +182,7 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -199,11 +199,9 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] - ) + request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) worker = MachineLearningWorkerCore @@ -223,9 +221,9 @@ def test_fetch_input_disk_missing() -> None: fsd = feature_store.descriptor key = "/path/that/doesnt/exist" - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: @@ -245,14 +243,12 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] - ) + request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)]) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -284,13 +280,13 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> request = InferenceRequest( input_keys=[ - FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), - FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), - FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), + TensorKey(key=tensor_name + "1", descriptor=fsd), + TensorKey(key=tensor_name + "2", descriptor=fsd), + TensorKey(key=tensor_name + "3", descriptor=fsd), ] ) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch 
= RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -310,9 +306,9 @@ def test_fetch_input_feature_store_missing() -> None: key = "bad-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: @@ -332,9 +328,9 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: key = "test-model" feature_store[key] = persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -351,9 +347,9 @@ def test_place_outputs() -> None: # create a key to retrieve from the feature store keys = [ - FeatureStoreKey(key=key_name + "1", descriptor=fsd), - FeatureStoreKey(key=key_name + "2", descriptor=fsd), - FeatureStoreKey(key=key_name + "3", descriptor=fsd), + TensorKey(key=key_name + "1", descriptor=fsd), + TensorKey(key=key_name + "2", descriptor=fsd), + TensorKey(key=key_name + "3", descriptor=fsd), ] data = [b"abcdef", b"ghijkl", b"mnopqr"] @@ -376,6 +372,6 @@ def test_place_outputs() -> None: pytest.param("key", "", id="invalid descriptor"), ], ) -def test_invalid_featurestorekey(key, descriptor) -> None: +def test_invalid_tensorkey(key, descriptor) -> None: with pytest.raises(ValueError): - fsk = FeatureStoreKey(key, descriptor) + fsk = TensorKey(key, descriptor) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index c58879cb6..d270e921c 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -36,7 +36,8 @@ ) from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, - FeatureStoreKey, + ModelKey, + TensorKey, ) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -116,9 +117,9 @@ def test_device_manager_model_in_request(): worker = MockWorker() - tensor_key = FeatureStoreKey(key="key", descriptor="desc") - output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey(key="model key", descriptor="desc") + tensor_key = TensorKey(key="key", descriptor="desc") + output_key = TensorKey(key="key", descriptor="desc") + model_key = ModelKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -154,9 +155,9 @@ def test_device_manager_model_key(): worker = MockWorker() - tensor_key = FeatureStoreKey(key="key", descriptor="desc") - output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey(key="model key", descriptor="desc") + tensor_key = TensorKey(key="key", descriptor="desc") + output_key = TensorKey(key="key", descriptor="desc") + model_key = ModelKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 0f3e38f93..b0934b6f5 100644 --- 
a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -55,7 +55,8 @@ ) from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, - FeatureStoreKey, + ModelKey, + TensorKey, ) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -127,12 +128,8 @@ def setup_worker_manager_model_bytes( cooldown=3, ) - tensor_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - output_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) inf_request = InferenceRequest( model_key=None, @@ -145,7 +142,7 @@ def setup_worker_manager_model_bytes( batch_size=0, ) - model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_id = ModelKey(key="key", descriptor=app_feature_store.descriptor) request_batch = RequestBatch( [inf_request], @@ -190,9 +187,9 @@ def setup_worker_manager_model_key( cooldown=3, ) - tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) + tensor_key = TensorKey(key="key", descriptor=app_feature_store.descriptor) + output_key = TensorKey(key="key", descriptor=app_feature_store.descriptor) + model_id = ModelKey(key="model key", descriptor=app_feature_store.descriptor) request = InferenceRequest( model_key=model_id, @@ -245,12 +242,8 @@ def setup_request_dispatcher_model_bytes( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - output_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [output_key], [], None @@ -293,14 +286,10 @@ def setup_request_dispatcher_model_key( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - output_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - model_key = MessageHandler.build_feature_store_key( - key="model key", feature_store_descriptor=app_feature_store.descriptor + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model_key = MessageHandler.build_model_key( + key="model key", descriptor=app_feature_store.descriptor ) request = MessageHandler.build_request( test_dir, model_key, [tensor_key], [output_key], [], None diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index a2c8118ac..7f1649741 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -66,7 +66,6 @@ # The tests in this file must run in a dragon environment pytestmark = pytest.mark.dragon -WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" @pytest.fixture @@ -83,7 +82,9 @@ def storage_for_dragon_fs_with_req_queue( fli_ = fli.FLInterface(main_ch=channel_, 
manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) - storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor + storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( + comm_channel.descriptor + ) return storage_for_dragon_fs @@ -97,7 +98,7 @@ def storage_for_dragon_fs_with_mock_req_queue( # comm_channel = DragonFLIChannel(fli_, True) mock_descriptor = "12345" - storage_for_dragon_fs[WORK_QUEUE_KEY] = mock_descriptor + storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = mock_descriptor return storage_for_dragon_fs @@ -192,6 +193,31 @@ def test_eventconsumer_eventpublisher_integration( assert len(app_messages) == 4 +def test_backbone_wait_for_no_keys( + storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for a value succeeds + immediately and does not cause a wait to occur if the supplied key + list is empty + + :param storage_for_dragon_fs_with_req_queue: the storage engine to use, + prepopulated with a worker queue descriptor + """ + # an empty key list should return immediately without waiting + storage = storage_for_dragon_fs_with_req_queue + + backbone = BackboneFeatureStore(storage) + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. + ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = backbone.wait_for([]) + assert len(values) == 0 + + # confirm that no wait occurred + bbtime.sleep.assert_not_called() + + def test_backbone_wait_for_prepopulated( storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -201,8 +227,6 @@ def test_backbone_wait_for_prepopulated( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - wait_timeout = 0.1 - # storage = {WORK_QUEUE_KEY: "123456"} storage = storage_for_dragon_fs_with_req_queue backbone = BackboneFeatureStore(storage) @@ -211,18 +235,50 @@ # all keys should be found and the timeout should never be checked. ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([WORK_QUEUE_KEY]) + values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) # confirm that wait_for with one key returns one value assert len(values) == 1 # confirm that the descriptor is non-null w/some non-trivial value - assert len(values[WORK_QUEUE_KEY]) > 5 + assert len(values[BackboneFeatureStore.MLI_WORKER_QUEUE]) > 5 # confirm that no wait occurred bbtime.sleep.assert_not_called() +def test_backbone_wait_for_prepopulated_dupe( + storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for keys that are duplicated + results in a single value being returned for each key + + :param storage_for_dragon_fs_with_req_queue: the storage engine to use, + prepopulated with a worker queue descriptor + """ + # the keys are pre-populated below, so no wait should occur + storage = storage_for_dragon_fs_with_req_queue + + backbone = BackboneFeatureStore(storage) + key1, key2 = "key-1", "key-2" + value1, value2 = "i-am-value-1", "i-am-value-2" + backbone[key1] = value1 + backbone[key2] = value2 + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. 
+ ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = backbone.wait_for([key1, key2, key1]) # key1 is duplicated + + # confirm that the duplicated key collapses to a single entry per unique key + assert len(values) == 2 + assert key1 in values + assert key2 in values + + assert values[key1] == value1 + assert values[key2] == value2 + + def set_value_after_delay( descriptor: str, key: str, value: str, delay: float = 5 ) -> None: @@ -238,6 +294,7 @@ def set_value_after_delay( logger.debug(f"set_value_after_delay wrote `{value} to backbone[`{key}`]") +@pytest.mark.skip(reason="Using mp on build agent is not working correctly") @pytest.mark.parametrize("delay", [0, 1, 2, 4, 8]) def test_backbone_wait_for_partial_prepopulated( storage_for_dragon_fs_with_mock_req_queue: t.Any, delay: float @@ -264,7 +321,7 @@ def test_backbone_wait_for_partial_prepopulated( p2 = mp.Process( target=backbone.wait_for, - args=([WORK_QUEUE_KEY, key],), + args=([BackboneFeatureStore.MLI_WORKER_QUEUE, key],), kwargs={"timeout": wait_timeout}, ) p2.start() @@ -273,21 +330,25 @@ p2.join() # both values should be written at this time - ret_vals = backbone.wait_for([WORK_QUEUE_KEY, key], 0.1) + ret_vals = backbone.wait_for([key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1) # confirm that wait_for with two keys returns two values assert len(ret_vals) == 2, "values should contain values for both awaited keys" # confirm the pre-populated value has the correct output - assert ret_vals[WORK_QUEUE_KEY] == "12345" # mock descriptor value from fixture + assert ( + ret_vals[BackboneFeatureStore.MLI_WORKER_QUEUE] == "12345" + ) # mock descriptor value from fixture # confirm the population process completed and the awaited value is correct assert ret_vals[key] == value, "verify order of values " +@pytest.mark.skip(reason="Using mp on build agent is not working correctly") @pytest.mark.parametrize("num_keys", [0, 1, 3, 7, 11]) def test_backbone_wait_for_multikey( storage_for_dragon_fs_with_req_queue: t.Any, num_keys: int, + test_dir: str, ) -> None: """Verify that asking the backbone to wait for multiple keys results in that number of values being returned @@ -317,7 +378,7 @@ p2 = mp.Process( target=backbone.wait_for, - args=([[*extra_keys]],), + args=(extra_keys,), kwargs={"timeout": max_delay * 2}, ) p2.start() @@ -328,7 +389,9 @@ ) # give it 10 seconds longer than p2 timeout for backoff # use without a wait to verify all values are written - actual_values = backbone.wait_for([*extra_keys], timeout=0.01) + num_keys = len(extra_keys) + actual_values = backbone.wait_for(extra_keys, timeout=0.01) + assert len(extra_keys) == num_keys # confirm that wait_for returns all the expected values assert len(actual_values) == num_keys diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index bb5dccad7..94733afc7 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -767,7 +767,9 @@ def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None backbone = BackboneFeatureStore(storage) with pytest.raises(SmartSimError) as ex: - backbone.wait_for(["does-not-exist"]) + backbone.wait_for(["does-not-exist"], wait_timeout) + + assert "timeout" in str(ex.value.args[0]).lower() end_time = time.time() elapsed = end_time - start_time diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py 
index 104acd914..b088df5b4 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -77,7 +77,6 @@ def test_eventconsumer_eventpublisher_integration( mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - mock_fs_descriptor = backbone.descriptor # verify ability to write and read from ddict backbone["test_dir"] = test_dir @@ -220,7 +219,12 @@ def test_eventconsumer_max_dequeue( pytest.param(0, id="use default: 500"), pytest.param(1, id="non-zero buffer size: 1"), pytest.param(500, id="buffer size: 500"), - pytest.param(1000, id="buffer size: 1000"), + pytest.param(800, id="buffer size: 800"), + pytest.param( + 1000, + id="buffer size: 1000, unreliable in dragon-v0.10", + marks=pytest.mark.skip, + ), ], ) def test_channel_buffer_size( diff --git a/tests/dragon/test_inference_reply.py b/tests/dragon/test_inference_reply.py index 1eb137ae6..bdc7be14b 100644 --- a/tests/dragon/test_inference_reply.py +++ b/tests/dragon/test_inference_reply.py @@ -28,7 +28,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey from smartsim._core.mli.infrastructure.worker.worker import InferenceReply from smartsim._core.mli.message_handler import MessageHandler @@ -44,8 +44,8 @@ def inference_reply() -> InferenceReply: @pytest.fixture -def fs_key() -> FeatureStoreKey: - return FeatureStoreKey("key", "descriptor") +def fs_key() -> TensorKey: + return TensorKey("key", "descriptor") @pytest.mark.parametrize( diff --git a/tests/dragon/test_inference_request.py b/tests/dragon/test_inference_request.py index 909d021d6..f5c8b9bdc 100644 --- a/tests/dragon/test_inference_request.py +++ b/tests/dragon/test_inference_request.py @@ -28,7 +28,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey from smartsim._core.mli.infrastructure.worker.worker import InferenceRequest from smartsim._core.mli.message_handler import MessageHandler @@ -44,8 +44,8 @@ def inference_request() -> InferenceRequest: @pytest.fixture -def fs_key() -> FeatureStoreKey: - return FeatureStoreKey("key", "descriptor") +def fs_key() -> TensorKey: + return TensorKey("key", "descriptor") @pytest.mark.parametrize( diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 590780154..3eb800bb7 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -1,231 +1,231 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pickle -import time -import typing as t - -import pytest - -dragon = pytest.importorskip("dragon") - -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, - EventBroadcaster, - OnWriteFeatureStore, -) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict -from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -from smartsim.error.errors import SmartSimError -from smartsim.log import get_logger - -# isort: off -from dragon import fli -from dragon.channels import Channel - -# from ..ex..high_throughput_inference.mock_app import ProtoClient -from smartsim.protoclient import ProtoClient - - -# The tests in this file belong to the dragon group -pytestmark = pytest.mark.dragon -WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -logger = get_logger(__name__) - - -@pytest.fixture -def storage_for_dragon_fs() -> t.Dict[str, str]: - # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) - return dragon_ddict.DDict(1, 2, 4 * 1024**2) - - -@pytest.fixture -def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: - return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) - - -@pytest.fixture -def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: - """a stand-in for the worker manager so a worker queue exists""" - - # create the FLI - to_worker_channel = Channel.make_process_local() - # to_worker_channel = create_local() - fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) - - # store the descriptor in the backbone - # the_backbone.worker_queue = comm_channel.descriptor - the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor - - try: - comm_channel.send(b"foo") - except Exception as ex: - print(f"ohnooooo: {ex}") - - return comm_channel - - -@pytest.fixture -def storage_for_dragon_fs_with_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # create a valid FLI so any call to attach does not fail - channel_ = Channel.make_process_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) - - storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor - return storage_for_dragon_fs - - -@pytest.mark.parametrize( - "wait_timeout, exp_wait_max", - [ - # aggregate the 1+1+1 into 3 on remaining parameters - pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"), - pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), - pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), - ], -) -def test_protoclient_timeout( - 
wait_timeout: float, - exp_wait_max: float, - the_backbone: BackboneFeatureStore, - monkeypatch: pytest.MonkeyPatch, -): - """Verify that attempts to attach to the worker queue from the protoclient - timeout in an appropriate amount of time. Note: due to the backoff, we verify - the elapsed time is less than the 15s of a cycle of waits +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
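# While these tests are disabled, the `exp_wait_max` ceiling they assert derives
# from a partial sum over the client's backoff cycle. A small illustrative helper
# capturing that arithmetic (hypothetical; not part of the patch):

import itertools


def expected_max_wait(
    wait_timeout: float, backoff: tuple = (0.1, 0.5, 1, 2, 4, 8)
) -> float:
    """Accumulate backoff sleeps until the configured timeout is first exceeded."""
    elapsed = 0.0
    for step in itertools.cycle(backoff):
        elapsed += step
        if elapsed >= wait_timeout:
            # a caller may observe up to this much delay before the timeout fires
            return elapsed
    return elapsed  # pragma: no cover - unreachable, the cycle is infinite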
+ +# import pickle +# import time +# import typing as t + +# import pytest + +# dragon = pytest.importorskip("dragon") + +# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# BackboneFeatureStore, +# EventBroadcaster, +# OnWriteFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +# from smartsim.error.errors import SmartSimError +# from smartsim.log import get_logger + +# # isort: off +# from dragon import fli +# from dragon.channels import Channel + +# # from ..ex..high_throughput_inference.mock_app import ProtoClient +# from smartsim.protoclient import ProtoClient + + +# # The tests in this file belong to the dragon group +# pytestmark = pytest.mark.dragon +# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" +# logger = get_logger(__name__) + + +# @pytest.fixture +# def storage_for_dragon_fs() -> t.Dict[str, str]: +# # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) +# return dragon_ddict.DDict(1, 2, 4 * 1024**2) + + +# @pytest.fixture +# def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: +# return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + + +# @pytest.fixture +# def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: +# """a stand-in for the worker manager so a worker queue exists""" + +# # create the FLI +# to_worker_channel = Channel.make_process_local() +# # to_worker_channel = create_local() +# fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) +# comm_channel = DragonFLIChannel(fli_, True) + +# # store the descriptor in the backbone +# # the_backbone.worker_queue = comm_channel.descriptor +# the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor + +# try: +# comm_channel.send(b"foo") +# except Exception as ex: +# print(f"ohnooooo: {ex}") + +# return comm_channel + + +# @pytest.fixture +# def storage_for_dragon_fs_with_req_queue( +# storage_for_dragon_fs: t.Dict[str, str] +# ) -> t.Dict[str, str]: +# # create a valid FLI so any call to attach does not fail +# channel_ = Channel.make_process_local() +# fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) +# comm_channel = DragonFLIChannel(fli_, True) + +# storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor +# return storage_for_dragon_fs + + +# @pytest.mark.parametrize( +# "wait_timeout, exp_wait_max", +# [ +# # aggregate the 1+1+1 into 3 on remaining parameters +# pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), +# pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), +# pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), +# ], +# ) +# def test_protoclient_timeout( +# wait_timeout: float, +# exp_wait_max: float, +# the_backbone: BackboneFeatureStore, +# monkeypatch: pytest.MonkeyPatch, +# ): +# """Verify that attempts to attach to the worker queue from the protoclient +# timeout in an appropriate amount of time. 
Note: due to the backoff, we verify +# the elapsed time is less than the 15s of a cycle of waits - :param wait_timeout: a timeout for use when configuring a proto client - :param exp_wait_max: a ceiling for the expected time spent waiting for - the timeout - :param the_backbone: a pre-initialized backbone featurestore for setting up - the environment variable required by the client""" +# :param wait_timeout: a timeout for use when configuring a proto client +# :param exp_wait_max: a ceiling for the expected time spent waiting for +# the timeout +# :param the_backbone: a pre-initialized backbone featurestore for setting up +# the environment variable required by the client""" - # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] - # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) - start_time = time.time() - with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: - ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) +# # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] +# # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) +# start_time = time.time() +# with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: +# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) - ProtoClient(False, wait_timeout=wait_timeout) +# ProtoClient(False, wait_timeout=wait_timeout) - end_time = time.time() - elapsed = end_time - start_time +# end_time = time.time() +# elapsed = end_time - start_time - # todo: revisit. should this trigger any wait if the backbone is set above? - # confirm that we met our timeout - # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" +# # todo: revisit. should this trigger any wait if the backbone is set above? +# # confirm that we met our timeout +# # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" - # confirm that the total wait time is aligned with the sleep cycle - assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" +# # confirm that the total wait time is aligned with the sleep cycle +# assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" -def test_protoclient_initialization_no_backbone(): - """Verify that attempting to start the client without required environment variables - results in an exception. NOTE: Backbone env var is not set""" +# def test_protoclient_initialization_no_backbone(): +# """Verify that attempting to start the client without required environment variables +# results in an exception. 
NOTE: Backbone env var is not set""" - with pytest.raises(SmartSimError) as ex: - ProtoClient(timing_on=False) +# with pytest.raises(SmartSimError) as ex: +# ProtoClient(timing_on=False) - # confirm the missing value error has been raised - assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) +# # confirm the missing value error has been raised +# assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) -def test_protoclient_initialization( - the_backbone: BackboneFeatureStore, - the_worker_queue: DragonFLIChannel, - monkeypatch: pytest.MonkeyPatch, -): - """Verify that attempting to start the client with required env vars results - in a fully initialized client +# def test_protoclient_initialization( +# the_backbone: BackboneFeatureStore, +# the_worker_queue: DragonFLIChannel, +# monkeypatch: pytest.MonkeyPatch, +# ): +# """Verify that attempting to start the client with required env vars results +# in a fully initialized client - :param the_backbone: a pre-initialized backbone featurestore - :param the_worker_queue: an FLI channel the client will retrieve - from the backbone""" +# :param the_backbone: a pre-initialized backbone featurestore +# :param the_worker_queue: an FLI channel the client will retrieve +# from the backbone""" - with monkeypatch.context() as ctx: - ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) - # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture +# with monkeypatch.context() as ctx: +# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) +# # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture - client = ProtoClient(timing_on=False) +# client = ProtoClient(timing_on=False) - # confirm the backbone was attached correctly - assert client._backbone is not None - assert client._backbone.descriptor == the_backbone.descriptor +# # confirm the backbone was attached correctly +# assert client._backbone is not None +# assert client._backbone.descriptor == the_backbone.descriptor - # confirm the worker queue is created and attached correctly - assert client._to_worker_fli is not None - assert client._to_worker_fli.descriptor == the_worker_queue.descriptor +# # confirm the worker queue is created and attached correctly +# assert client._to_worker_fli is not None +# assert client._to_worker_fli.descriptor == the_worker_queue.descriptor - # confirm the worker channels are created - assert client._from_worker_ch is not None - assert client._from_worker_ch.descriptor +# # confirm the worker channels are created +# assert client._from_worker_ch is not None +# assert client._from_worker_ch.descriptor - assert client._to_worker_ch is not None - assert client._to_worker_ch.descriptor +# assert client._to_worker_ch is not None +# assert client._to_worker_ch.descriptor - # confirm a publisher is created - assert client._publisher is not None +# # confirm a publisher is created +# assert client._publisher is not None -def test_protoclient_write_model( - the_backbone: BackboneFeatureStore, - the_worker_queue: DragonFLIChannel, - monkeypatch: pytest.MonkeyPatch, -): - """Verify that writing a model using the client causes the model data to be - written to a feature store and triggers a key-written event +# def test_protoclient_write_model( +# the_backbone: BackboneFeatureStore, +# the_worker_queue: DragonFLIChannel, +# monkeypatch: pytest.MonkeyPatch, +# ): +# """Verify that writing a model using the client causes the model data to be +# 
written to a feature store and triggers a key-written event - :param the_backbone: a pre-initialized backbone featurestore - :param the_worker_queue: an FLI channel the client will retrieve - from the backbone""" +# :param the_backbone: a pre-initialized backbone featurestore +# :param the_worker_queue: an FLI channel the client will retrieve +# from the backbone""" - with monkeypatch.context() as ctx: - ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) - # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture +# with monkeypatch.context() as ctx: +# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) +# # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture - client = ProtoClient(timing_on=False) +# client = ProtoClient(timing_on=False) - model_key = "my-model" - model_bytes = b"12345" +# model_key = "my-model" +# model_bytes = b"12345" - client.set_model(model_key, model_bytes) +# client.set_model(model_key, model_bytes) - # confirm the client modified the underlying feature store - assert client._backbone[model_key] == model_bytes +# # confirm the client modified the underlying feature store +# assert client._backbone[model_key] == model_bytes - publisher = t.cast(EventBroadcaster, client._publisher) +# publisher = t.cast(EventBroadcaster, client._publisher) - # confirm the client raised the key-written event - assert len(publisher._event_buffer) == 1 +# # confirm the client raised the key-written event +# assert len(publisher._event_buffer) == 1 - event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop())) - assert event.descriptor == the_backbone.descriptor - assert event.key == model_key +# event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop())) +# assert event.descriptor == the_backbone.descriptor +# assert event.key == model_key diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 714492f37..e666710e6 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -27,6 +27,7 @@ import gc import io import logging +import os import pathlib import socket import time @@ -36,18 +37,16 @@ import numpy as np import pytest -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") +pytest.importorskip("torch") +pytest.importorskip("dragon") -import base64 -import multiprocessing as mp -try: - mp.set_start_method("dragon") -except Exception: - pass +# isort: off +import dragon +import multiprocessing as mp +import torch -import os +# isort: on import dragon.channels as dch import dragon.infrastructure.policy as dragon_policy @@ -55,14 +54,14 @@ import dragon.native.process as dragon_process import torch.nn as nn from dragon import fli -from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.managed_memory import MemoryAlloc, MemoryPool -from dragon.mpbridge.queues import DragonQueue +from dragon.managed_memory import MemoryAlloc from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_channel import ( + DragonCommChannel, + create_local, +) from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestBatch, @@ -71,6 +70,9 @@ from 
smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -79,13 +81,12 @@ from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .feature_store import FileSystemFeatureStore -from .utils.channel import FileSystemCommChannel - logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon +mp.set_start_method("dragon") + class MiniModel(nn.Module): def __init__(self): @@ -136,14 +137,18 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( request_dispatcher_queue: DragonFLIChannel, feature_store: FeatureStore, + parent_iteration: int, + callback_descriptor: str, ) -> None: """Mock event producer for triggering the inference pipeline""" model_key = "mini-model" + # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) + offset = 2 * parent_iteration for iteration_number in range(2): + logged_iteration = offset + iteration_number + logger.debug(f"Sending mock message {logged_iteration}") - channel = Channel.make_process_local() - callback_channel = DragonCommChannel(channel) output_key = f"output-{iteration_number}" feature_store[model_key] = load_model() @@ -157,25 +162,35 @@ def mock_messages( "c", "float32", list(tensor.shape) ) - message_tensor_output_key = MessageHandler.build_feature_store_key( - output_key, fsd - ) - message_model_key = MessageHandler.build_feature_store_key(model_key, fsd) + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, + reply_channel=callback_descriptor, model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], output_descriptors=[], custom_attributes=None, ) + + logger.info(f"Sending request {iteration_number} to request_dispatcher_queue") request_bytes = MessageHandler.serialize_request(request) with request_dispatcher_queue._fli.sendh( timeout=None, stream_channel=request_dispatcher_queue._channel ) as sendh: sendh.send_bytes(request_bytes) sendh.send_bytes(tensor.tobytes()) + + logger.info( + f"Retrieving {iteration_number} from callback channel: {callback_descriptor}" + ) + callback_channel = DragonCommChannel.from_descriptor(callback_descriptor) + + # Results will be empty. The test pulls messages off the queue before they + # can be serviced by a worker. Just ensure the callback channel works. + results = callback_channel.recv(timeout=0.1) + logger.debug(f"Received mock message results on callback channel: {results}") time.sleep(1) @@ -216,16 +231,17 @@ def test_request_dispatcher() -> None: longer referenced by the dispatcher. 
""" - to_worker_channel = dch.Channel.make_process_local() + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) + ddict = DDict(1, 2, 4 * 1024**2) + backbone_fs = BackboneFeatureStore(ddict, allow_reserved_writes=True) + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_ch.descriptor - - ddict = DDict(1, 2, 4 * 1024**2) - dragon_fs = DragonFeatureStore(ddict) + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone_fs.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -243,46 +259,49 @@ def test_request_dispatcher() -> None: worker_queue = config_loader.get_queue() if worker_queue is None: - logger.warn( + logger.warning( "FLI input queue not loaded correctly from config_loader: " f"{config_loader._queue_descriptor}" ) request_dispatcher._on_start() - for _ in range(2): + for i in range(2): batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] - model_key = "mini-model" + + # NOTE: creating callbacks in test to avoid a local channel being torn + # down when mock_messages terms but before the final response message is sent + + callback_channel = DragonCommChannel.from_local() # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, - args=( - worker_queue, - dragon_fs, - ), + args=(worker_queue, backbone_fs, i, callback_channel.descriptor), ) msg_pump.start() time.sleep(1) - for _ in range(15): + for _ in range(200): try: request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) + batch = request_dispatcher.task_queue.get(timeout=0.1) break except Empty: continue except Exception as exc: raise exc - try: - assert batch is not None - assert batch.has_valid_requests + assert batch is not None + assert batch.has_valid_requests + model_key = batch.model_id.key + + try: transform_result = batch.inputs for transformed, dims, dtype in zip( transform_result.transformed, diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 9a5ed6309..2a9e7d01b 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -37,7 +37,7 @@ from torch import nn from torch.nn import functional as F -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -109,7 +109,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey(key="model", descriptor="xyz"), + model_key=ModelKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensor_numpy, input_keys=None, diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 43b8cc7ec..b2ddb3481 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -146,7 +146,7 @@ # model_bytes = load_model() # backbone[model_key] = model_bytes -# message_model_key = MessageHandler.build_feature_store_key( +# message_model_key = 
MessageHandler.build_model_key( # model_key, backbone.descriptor # ) @@ -183,10 +183,10 @@ # fsd = backbone.descriptor -# # message_tensor_output_key = MessageHandler.build_feature_store_key( +# # message_tensor_output_key = MessageHandler.build_tensor_key( # # output_key, fsd # # ) -# # message_tensor_input_key = MessageHandler.build_feature_store_key( +# # message_tensor_input_key = MessageHandler.build_tensor_key( # # input_key, fsd # # ) diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 09e1703bc..b00ba9aa2 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -117,5 +117,5 @@ def from_descriptor( path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: - logger.warning(f"failed to create fs comm channel: {descriptor!r}") + logger.warning(f"failed to create fs comm channel: {descriptor}") raise diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py index 67a9a4a9b..60f1f0c6b 100644 --- a/tests/mli/test_integrated_torch_worker.py +++ b/tests/mli/test_integrated_torch_worker.py @@ -106,9 +106,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# message_model_key = MessageHandler.build_feature_store_key(model_key) +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# message_model_key = MessageHandler.build_model_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -146,9 +146,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# # message_model_key = MessageHandler.build_feature_store_key(model_key) +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -187,9 +187,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# # message_model_key = MessageHandler.build_feature_store_key(model_key) +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) @@ -231,9 +231,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# # message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# message_model_key = MessageHandler.build_feature_store_key(model_key) +# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) 
+# message_model_key = MessageHandler.build_model_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 7b678239a..b1d8cd34c 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -511,10 +511,18 @@ def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir # ensure file was overwritten and env vars are not duplicated dotenv_content = exp_env_path.read_text(encoding="utf-8") - split_content = dotenv_content.split(var_name) - - # split to confirm env var only appars once - assert len(split_content) == 2 + lines = [ + line for line in dotenv_content.split("\n") if line and not "#" in line + ] + for line in lines: + if line.startswith(var_name): + # make sure the var isn't defined recursively + # DRAGON_BASE_DIR=$DRAGON_BASE_DIR + assert var_name not in line[len(var_name) + 1 :] + else: + # make sure any values reference the original base dir var + if var_name in line: + assert f"${var_name}" in line def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): @@ -532,7 +540,7 @@ def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): content = exp_env_path.read_text(encoding="utf-8") # ensure we have values written, but ignore empty lines - lines = [line for line in content.split("\n") if line] + lines = [line for line in content.split("\n") if line and not "#" in line] assert lines # ensure each line is formatted as key=value diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 37c46a573..ea45a2cb7 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -510,7 +510,26 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st assert loaded_env # confirm .env was parsed as expected by inspecting a key + assert "DRAGON_BASE_DIR" in loaded_env + base_dir = loaded_env["DRAGON_BASE_DIR"] + assert "DRAGON_ROOT_DIR" in loaded_env + assert loaded_env["DRAGON_ROOT_DIR"] == base_dir + + assert "DRAGON_INCLUDE_DIR" in loaded_env + assert loaded_env["DRAGON_INCLUDE_DIR"] == f"{base_dir}/include" + + assert "DRAGON_LIB_DIR" in loaded_env + assert loaded_env["DRAGON_LIB_DIR"] == f"{base_dir}/lib" + + assert "DRAGON_VERSION" in loaded_env + assert loaded_env["DRAGON_VERSION"] == DEFAULT_DRAGON_VERSION + + assert "PATH" in loaded_env + assert loaded_env["PATH"] == f"{base_dir}/bin" + + assert "LD_LIBRARY_PATH" in loaded_env + assert loaded_env["LD_LIBRARY_PATH"] == f"{base_dir}/lib" def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): diff --git a/tests/test_featurestore.py b/tests/test_featurestore.py deleted file mode 100644 index f0b122bcf..000000000 --- a/tests/test_featurestore.py +++ /dev/null @@ -1,711 +0,0 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. 
-# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# import pathlib -# import time -# import typing as t -# import unittest.mock as mock - -# import pytest - -# dragon = pytest.importorskip("dragon") - -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# BackboneFeatureStore, -# EventBroadcaster, -# EventCategory, -# EventConsumer, -# OnCreateConsumer, -# OnWriteFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# time as bbtime, -# ) -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( -# DragonFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -# from smartsim.error import SmartSimError -# from tests.mli.channel import FileSystemCommChannel -# from tests.mli.feature_store import MemoryFeatureStore - -# if t.TYPE_CHECKING: -# import conftest - - -# # The tests in this file belong to the group_a group -# pytestmark = pytest.mark.group_a - -# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -# RANDOMLY_SET_KEY = "_SOMETHING_ELSE" - - -# @pytest.fixture -# def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: -# storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"} -# return storage - - -# def boom(*args, **kwargs) -> None: -# """Helper function that blows up when used to mock up -# some other function""" -# raise Exception(f"you shall not pass! 
{args}, {kwargs}") - - -# def test_event_uid() -> None: -# """Verify that all events include a unique identifier""" -# uids: t.Set[str] = set() -# num_iters = 1000 - -# # generate a bunch of events and keep track all the IDs -# for i in range(num_iters): -# event_a = OnCreateConsumer(str(i), []) -# event_b = OnWriteFeatureStore(str(i), "key") - -# uids.add(event_a.uid) -# uids.add(event_b.uid) - -# # verify each event created a unique ID -# assert len(uids) == 2 * num_iters - - -# def test_mli_reserved_keys_conversion() -> None: -# """Verify that conversion from a string to an enum member -# works as expected""" - -# for reserved_key in ReservedKeys: -# # iterate through all keys and verify `from_string` works -# assert ReservedKeys.contains(reserved_key.value) - -# # show that the value (actual key) not the enum member name -# # will not be incorrectly identified as reserved -# assert not ReservedKeys.contains(str(reserved_key).split(".")[1]) - - -# def test_mli_reserved_keys_writes() -> None: -# """Verify that attempts to write to reserved keys are blocked from a -# standard DragonFeatureStore but enabled with the BackboneFeatureStore""" - -# mock_storage = {} -# dfs = DragonFeatureStore(mock_storage) -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# other = MemoryFeatureStore(mock_storage) - -# expected_value = "value" - -# for reserved_key in ReservedKeys: -# # we expect every reserved key to fail using DragonFeatureStore... -# with pytest.raises(SmartSimError) as ex: -# dfs[reserved_key] = expected_value - -# assert "reserved key" in ex.value.args[0] - -# # ... and expect other feature stores to respect reserved keys -# with pytest.raises(SmartSimError) as ex: -# other[reserved_key] = expected_value - -# assert "reserved key" in ex.value.args[0] - -# # ...and those same keys to succeed on the backbone -# backbone[reserved_key] = expected_value -# actual_value = backbone[reserved_key] -# assert actual_value == expected_value - - -# def test_mli_consumers_read_by_key() -> None: -# """Verify that the value returned from the mli consumers -# method is written to the correct key and reads are -# allowed via standard dragon feature store. 
-# NOTE: should reserved reads also be blocked""" - -# mock_storage = {} -# dfs = DragonFeatureStore(mock_storage) -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# other = MemoryFeatureStore(mock_storage) - -# expected_value = "value" - -# # write using backbone that has permission to write reserved keys -# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value - -# # confirm read-only access to reserved keys from any FeatureStore -# for fs in [dfs, backbone, other]: -# assert fs[ReservedKeys.MLI_NOTIFY_CONSUMERS] == expected_value - - -# def test_mli_consumers_read_by_backbone() -> None: -# """Verify that the backbone reads the correct location -# when using the backbone feature store API instead of mapping API""" - -# mock_storage = {} -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# expected_value = "value" - -# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value - -# # confirm reading via convenience method returns expected value -# assert backbone.notification_channels[0] == expected_value - - -# def test_mli_consumers_write_by_backbone() -> None: -# """Verify that the backbone writes the correct location -# when using the backbone feature store API instead of mapping API""" - -# mock_storage = {} -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# expected_value = ["value"] - -# backbone.notification_channels = expected_value - -# # confirm write using convenience method targets expected key -# assert backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] == ",".join(expected_value) - - -# def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: -# """Verify that a broadcast operation without any registered subscribers -# succeeds without raising Exceptions - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} -# consumer_descriptor = storage_path / "test-consumer" - -# # NOTE: we're not putting any consumers into the backbone here! 
-# backbone = BackboneFeatureStore(mock_storage) - -# event = OnCreateConsumer(consumer_descriptor, []) - -# publisher = EventBroadcaster(backbone) -# num_receivers = 0 - -# # publishing this event without any known consumers registered should succeed -# # but report that it didn't have anybody to send the event to -# consumer_descriptor = storage_path / f"test-consumer" -# event = OnCreateConsumer(consumer_descriptor, []) - -# num_receivers += publisher.send(event) - -# # confirm no changes to the backbone occur when fetching the empty consumer key -# key_in_features_store = ReservedKeys.MLI_NOTIFY_CONSUMERS in backbone -# assert not key_in_features_store - -# # confirm that the broadcast reports no events published -# assert num_receivers == 0 -# # confirm that the broadcast buffered the event for a later send -# assert publisher.num_buffered == 1 - - -# def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: -# """Verify that a broadcast operation without any registered subscribers -# succeeds without raising Exceptions - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# # prep our backbone with a consumer list -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = [] - -# event = OnCreateConsumer(consumer_descriptor, []) -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) -# num_receivers = publisher.send(event) - -# registered_consumers = backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] - -# # confirm that no consumers exist in backbone to send to -# assert not registered_consumers -# # confirm that the broadcast reports no events published -# assert num_receivers == 0 -# # confirm that the broadcast buffered the event for a later send -# assert publisher.num_buffered == 1 - - -# def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: -# """Verify that a broadcast operation reports an error if no channel -# factory was supplied for constructing the consumer channels - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# # prep our backbone with a consumer list -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = [consumer_descriptor] - -# event = OnCreateConsumer(consumer_descriptor, []) -# publisher = EventBroadcaster( -# backbone, -# # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied -# ) - -# with pytest.raises(SmartSimError) as ex: -# publisher.send(event) - -# assert "factory" in ex.value.args[0] - - -# def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: -# """Verify that a successful broadcast clears messages from the event -# buffer when a new message is sent and consumers are registered - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system 
descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = (consumer_descriptor,) - -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# # mock building up some buffered events -# num_buffered_events = 14 -# for i in range(num_buffered_events): -# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) -# publisher._event_buffer.append(bytes(event)) - -# event0 = OnCreateConsumer( -# storage_path / f"test-consumer-{str(num_buffered_events + 1)}", [] -# ) - -# num_receivers = publisher.send(event0) -# # 1 receiver x 15 total events == 15 events -# assert num_receivers == num_buffered_events + 1 - - -# @pytest.mark.parametrize( -# "num_consumers, num_buffered, expected_num_sent", -# [ -# pytest.param(0, 7, 0, id="0 x (7+1) - no consumers, multi-buffer"), -# pytest.param(1, 7, 8, id="1 x (7+1) - single consumer, multi-buffer"), -# pytest.param(2, 7, 16, id="2 x (7+1) - multi-consumer, multi-buffer"), -# pytest.param(4, 4, 20, id="4 x (4+1) - multi-consumer, multi-buffer (odd #)"), -# pytest.param(9, 0, 9, id="13 x (0+1) - multi-consumer, empty buffer"), -# ], -# ) -# def test_eventpublisher_broadcast_returns_total_sent( -# test_dir: str, num_consumers: int, num_buffered: int, expected_num_sent: int -# ) -> None: -# """Verify that a successful broadcast returns the total number of events -# sent, including buffered messages. - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param num_consumers: the number of consumers to mock setting up prior to send -# :param num_buffered: the number of pre-buffered events to mock up -# :param expected_num_sent: the expected result from calling send -# """ -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumers = [] -# for i in range(num_consumers): -# consumers.append(storage_path / f"test-consumer-{i}") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = consumers - -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# # mock building up some buffered events -# for i in range(num_buffered): -# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) -# publisher._event_buffer.append(bytes(event)) - -# assert publisher.num_buffered == num_buffered - -# # this event will trigger clearing anything already in buffer -# event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", []) - -# # num_receivers should contain a number that computes w/all consumers and all events -# num_receivers = publisher.send(event0) - -# assert num_receivers == expected_num_sent - - -# def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: -# """Verify that any unused consumers are pruned each time a new event is sent - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - -# publisher = EventBroadcaster( -# backbone, 
channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# event = OnCreateConsumer(consumer_descriptor, []) - -# # the only registered cnosumer is in the event, expect no pruning -# backbone.notification_channels = (consumer_descriptor,) - -# publisher.send(event) -# assert str(consumer_descriptor) in publisher._channel_cache -# assert len(publisher._channel_cache) == 1 - -# # add a new descriptor for another event... -# consumer_descriptor2 = storage_path / "test-consumer-2" -# # ... and remove the old descriptor from the backbone when it's looked up -# backbone.notification_channels = (consumer_descriptor2,) - -# event = OnCreateConsumer(consumer_descriptor2, []) - -# publisher.send(event) - -# assert str(consumer_descriptor2) in publisher._channel_cache -# assert str(consumer_descriptor) not in publisher._channel_cache -# assert len(publisher._channel_cache) == 1 - -# # test multi-consumer pruning by caching some extra channels -# prune0, prune1, prune2 = "abc", "def", "ghi" -# publisher._channel_cache[prune0] = "doesnt-matter-if-it-is-pruned" -# publisher._channel_cache[prune1] = "doesnt-matter-if-it-is-pruned" -# publisher._channel_cache[prune2] = "doesnt-matter-if-it-is-pruned" - -# # add in one of our old channels so we prune the above items, send to these -# backbone.notification_channels = (consumer_descriptor, consumer_descriptor2) - -# publisher.send(event) - -# assert str(consumer_descriptor2) in publisher._channel_cache - -# # NOTE: we should NOT prune something that isn't used by this message but -# # does appear in `backbone.notification_channels` -# assert str(consumer_descriptor) in publisher._channel_cache - -# # confirm all of our items that were not in the notification channels are gone -# for pruned in [prune0, prune1, prune2]: -# assert pruned not in publisher._channel_cache - -# # confirm we have only the two expected items in the channel cache -# assert len(publisher._channel_cache) == 2 - - -# def test_eventpublisher_serialize_failure( -# test_dir: str, monkeypatch: pytest.MonkeyPatch -# ) -> None: -# """Verify that errors during message serialization are raised to the caller - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param monkeypatch: pytest fixture for modifying behavior of existing code -# with mock implementations""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# with monkeypatch.context() as patch: -# event = OnCreateConsumer(target_descriptor, []) - -# # patch the __bytes__ implementation to cause pickling to fail during send -# patch.setattr(event, "__bytes__", lambda x: b"abc") - -# backbone.notification_channels = (target_descriptor,) - -# # send a message into the channel -# with pytest.raises(ValueError) as ex: -# publisher.send(event) - -# assert "serialize" in ex.value.args[0] - - -# def test_eventpublisher_factory_failure( -# test_dir: str, monkeypatch: pytest.MonkeyPatch -# ) -> None: -# """Verify that errors during channel construction are raised to the caller - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param 
monkeypatch: pytest fixture for modifying behavior of existing code -# with mock implementations""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# def boom(descriptor: str) -> None: -# raise Exception(f"you shall not pass! {descriptor}") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# publisher = EventBroadcaster(backbone, channel_factory=boom) - -# with monkeypatch.context() as patch: -# event = OnCreateConsumer(target_descriptor, []) - -# backbone.notification_channels = (target_descriptor,) - -# # send a message into the channel -# with pytest.raises(SmartSimError) as ex: -# publisher.send(event) - -# assert "construct" in ex.value.args[0] - - -# def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None: -# """Verify that unexpected errors during message send are caught and wrapped in a -# SmartSimError so they are not propagated directly to the caller - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param monkeypatch: pytest fixture for modifying behavior of existing code -# with mock implementations""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# def boom(self) -> None: -# raise Exception("That was unexpected...") - -# with monkeypatch.context() as patch: -# event = OnCreateConsumer(target_descriptor, []) - -# # patch the _broadcast implementation to cause send to fail after -# # after the event has been pickled -# patch.setattr(publisher, "_broadcast", boom) - -# backbone.notification_channels = (target_descriptor,) - -# # Here, we see the exception raised by broadcast that isn't expected -# # is not allowed directly out, and instead is wrapped in SmartSimError -# with pytest.raises(SmartSimError) as ex: -# publisher.send(event) - -# assert "unexpected" in ex.value.args[0] - - -# def test_eventconsumer_receive(test_dir: str) -> None: -# """Verify that a consumer retrieves a message from the given channel - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) -# event = OnCreateConsumer(target_descriptor, []) - -# # simulate a sent event by writing directly to the input comm channel -# comm_channel.send(bytes(event)) - -# consumer = EventConsumer(comm_channel, backbone) - -# all_received: t.List[OnCreateConsumer] = consumer.receive() -# assert len(all_received) == 1 - -# # verify we received the same event that was raised -# assert all_received[0].category == event.category -# assert all_received[0].descriptor == 
event.descriptor - - -# @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16]) -# def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: -# """Verify that a consumer retrieves multiple message from the given channel - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param num_sent: parameterized value used to vary the number of events -# that are enqueued and validations are checked at multiple queue sizes""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) - -# # simulate multiple sent events by writing directly to the input comm channel -# for _ in range(num_sent): -# event = OnCreateConsumer(target_descriptor, []) -# comm_channel.send(bytes(event)) - -# consumer = EventConsumer(comm_channel, backbone) - -# all_received: t.List[OnCreateConsumer] = consumer.receive() -# assert len(all_received) == num_sent - - -# def test_eventconsumer_receive_empty(test_dir: str) -> None: -# """Verify that a consumer receiving an empty message ignores the -# message and continues processing - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) - -# # simulate a sent event by writing directly to the input comm channel -# comm_channel.send(bytes(b"")) - -# consumer = EventConsumer(comm_channel, backbone) - -# messages = consumer.receive() - -# # the messages array should be empty -# assert not messages - - -# def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: -# """Verify that the publisher and consumer integrate as expected when -# multiple publishers and consumers are sending simultaneously. 
- -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# mock_fs_descriptor = str(storage_path / f"mock-feature-store") - -# wmgr_channel = FileSystemCommChannel(storage_path / "test-wmgr") -# capp_channel = FileSystemCommChannel(storage_path / "test-capp") -# back_channel = FileSystemCommChannel(storage_path / "test-backend") - -# wmgr_consumer_descriptor = wmgr_channel.descriptor -# capp_consumer_descriptor = capp_channel.descriptor -# back_consumer_descriptor = back_channel.descriptor - -# # create some consumers to receive messages -# wmgr_consumer = EventConsumer( -# wmgr_channel, -# backbone, -# filters=[EventCategory.FEATURE_STORE_WRITTEN], -# ) -# capp_consumer = EventConsumer( -# capp_channel, -# backbone, -# ) -# back_consumer = EventConsumer( -# back_channel, -# backbone, -# filters=[EventCategory.CONSUMER_CREATED], -# ) - -# # create some broadcasters to publish messages -# mock_worker_mgr = EventBroadcaster( -# backbone, -# channel_factory=FileSystemCommChannel.from_descriptor, -# ) -# mock_client_app = EventBroadcaster( -# backbone, -# channel_factory=FileSystemCommChannel.from_descriptor, -# ) - -# # register all of the consumers even though the OnCreateConsumer really should -# # trigger its registration. event processing is tested elsewhere. -# backbone.notification_channels = [ -# wmgr_consumer_descriptor, -# capp_consumer_descriptor, -# back_consumer_descriptor, -# ] - -# # simulate worker manager sending a notification to backend that it's alive -# event_1 = OnCreateConsumer(wmgr_consumer_descriptor, []) -# mock_worker_mgr.send(event_1) - -# # simulate the app updating a model a few times -# event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") -# event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") -# event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") - -# mock_client_app.send(event_2) -# mock_client_app.send(event_3) -# mock_client_app.send(event_4) - -# # worker manager should only get updates about feature update -# wmgr_messages = wmgr_consumer.receive() -# assert len(wmgr_messages) == 3 - -# # the backend should only receive messages about consumer creation -# back_messages = back_consumer.receive() -# assert len(back_messages) == 1 - -# # hypothetical app has no filters and will get all events -# app_messages = capp_consumer.receive() -# assert len(app_messages) == 4 diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index 092ae4fe0..6c9b3dc95 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -34,14 +34,14 @@ handler = MessageHandler() -def test_build_feature_store_key_successful(): +def test_build_model_key_successful(): fsd = "mock-feature-store-descriptor" - model_key = handler.build_feature_store_key("tensor_key", fsd) + model_key = handler.build_model_key("tensor_key", fsd) assert model_key.key == "tensor_key" assert model_key.descriptor == fsd -def test_build_feature_store_key_unsuccessful(): +def test_build_model_key_unsuccessful(): with pytest.raises(ValueError): fsd = "mock-feature-store-descriptor" - model_key = handler.build_feature_store_key(100, fsd) + model_key = handler.build_model_key(100, fsd) diff --git 
a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index 2b5575965..beb9a4765 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -34,7 +34,7 @@ handler = MessageHandler() fsd = "mock-feature-store-descriptor" -tensor_key = handler.build_feature_store_key("key", fsd) +tensor_key = handler.build_tensor_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 751722534..a60818f7d 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -33,14 +33,14 @@ fsd = "mock-feature-store-descriptor" -model_key = MessageHandler.build_feature_store_key("model_key", fsd) +model_key = MessageHandler.build_model_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_feature_store_key("input_key1", fsd) -input_key2 = MessageHandler.build_feature_store_key("input_key2", fsd) +input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) +input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) -output_key1 = MessageHandler.build_feature_store_key("output_key1", fsd) -output_key2 = MessageHandler.build_feature_store_key("output_key2", fsd) +output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) +output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index d0305407c..86774132e 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -33,8 +33,8 @@ fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_feature_store_key("result_key1", fsd) -result_key2 = MessageHandler.build_feature_store_key("result_key2", fsd) +result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) +result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From fd0a5ecbe5ad7596adb0067aacbdc77cf9c4e8ea Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:43:57 -0500 Subject: [PATCH 03/40] capture dragon start method ex --- tests/dragon/test_request_dispatcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index e666710e6..54dfcb68d 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -85,7 +85,10 @@ # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -mp.set_start_method("dragon") +try: + mp.set_start_method("dragon") +except Exception: + pass class MiniModel(nn.Module): From 00a4496678d13e2b686cd3191097b9172b563ac1 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 19 Sep 2024 23:34:29 -0500 Subject: [PATCH 04/40] Split message pump into separate module from dispatcher test --- tests/dragon/test_request_dispatcher.py | 179 +++------------------- tests/dragon/utils/msg_pump.py | 194 ++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 156 deletions(-) 
create mode 100644 tests/dragon/utils/msg_pump.py diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 54dfcb68d..352b2d538 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -25,11 +25,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import gc -import io import logging import os import pathlib -import socket +import subprocess as sp +import sys import time import typing as t from queue import Empty @@ -37,7 +37,6 @@ import numpy as np import pytest -pytest.importorskip("torch") pytest.importorskip("dragon") @@ -48,16 +47,10 @@ # isort: on -import dragon.channels as dch -import dragon.infrastructure.policy as dragon_policy -import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.process as dragon_process -import torch.nn as nn from dragon import fli from dragon.data.ddict.ddict import DDict from dragon.managed_memory import MemoryAlloc -from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.dragon_channel import ( DragonCommChannel, create_local, @@ -76,9 +69,7 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger(__name__) @@ -91,142 +82,6 @@ pass -class MiniModel(nn.Module): - def __init__(self): - super().__init__() - - self._name = "mini-model" - self._net = torch.nn.Linear(2, 1) - - def forward(self, input): - return self._net(input) - - @property - def bytes(self) -> bytes: - """Returns the model serialized to a byte stream""" - buffer = io.BytesIO() - scripted = torch.jit.trace(self._net, self.get_batch()) - torch.jit.save(scripted, buffer) - return buffer.getvalue() - - @classmethod - def get_batch(cls) -> "torch.Tensor": - return torch.randn((100, 2), dtype=torch.float32) - - -def load_model() -> bytes: - """Create a simple torch model in memory for testing""" - mini_model = MiniModel() - return mini_model.bytes - - -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. 
- - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path - - -def mock_messages( - request_dispatcher_queue: DragonFLIChannel, - feature_store: FeatureStore, - parent_iteration: int, - callback_descriptor: str, -) -> None: - """Mock event producer for triggering the inference pipeline""" - model_key = "mini-model" - # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) - offset = 2 * parent_iteration - - for iteration_number in range(2): - logged_iteration = offset + iteration_number - logger.debug(f"Sending mock message {logged_iteration}") - - output_key = f"output-{iteration_number}" - - feature_store[model_key] = load_model() - - tensor = ( - (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) - ).numpy() - fsd = feature_store.descriptor - - tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(tensor.shape) - ) - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=callback_descriptor, - model=message_model_key, - inputs=[tensor_desc], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - - logger.info(f"Sending request {iteration_number} to request_dispatcher_queue") - request_bytes = MessageHandler.serialize_request(request) - with request_dispatcher_queue._fli.sendh( - timeout=None, stream_channel=request_dispatcher_queue._channel - ) as sendh: - sendh.send_bytes(request_bytes) - sendh.send_bytes(tensor.tobytes()) - - logger.info( - f"Retrieving {iteration_number} from callback channel: {callback_descriptor}" - ) - callback_channel = DragonCommChannel.from_descriptor(callback_descriptor) - - # Results will be empty. The test pulls messages off the queue before they - # can be serviced by a worker. Just ensure the callback channel works. 
- results = callback_channel.recv(timeout=0.1) - logger.debug(f"Received mock message results on callback channel: {results}") - time.sleep(1) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def service_as_dragon_proc( - service: Service, cpu_affinity: list[int], gpu_affinity: list[int] -) -> dragon_process.Process: - - options = dragon_process_desc.ProcessOptions(make_inf_channels=True) - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) - return dragon_process.Process( - target=service.execute, - args=[], - cwd=os.getcwd(), - policy=local_policy, - options=options, - stderr=dragon_process.Popen.STDOUT, - stdout=dragon_process.Popen.STDOUT, - ) - - def test_request_dispatcher() -> None: """Test the request dispatcher batching and queueing system @@ -279,15 +134,27 @@ def test_request_dispatcher() -> None: callback_channel = DragonCommChannel.from_local() - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=(worker_queue, backbone_fs, i, callback_channel.descriptor), + fp = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" + cmd = [ + sys.executable, + str(fp.absolute()), + "--dispatch-fli-descriptor", + worker_queue.descriptor, + "--fs-descriptor", + backbone_fs.descriptor, + "--parent-iteration", + str(i), + "--callback-descriptor", + callback_channel.descriptor, + ] + + popen = sp.Popen( + args=cmd, + stdout=sp.PIPE, + stderr=sp.PIPE, ) - msg_pump.start() - - time.sleep(1) + time.sleep(2) for _ in range(200): try: @@ -347,8 +214,6 @@ def test_request_dispatcher() -> None: for mem_alloc in mem_allocs: mem_alloc.free() - msg_pump.kill() - request_dispatcher._active_queues[model_key].make_disposable() assert request_dispatcher._active_queues[model_key].can_be_removed @@ -357,6 +222,8 @@ def test_request_dispatcher() -> None: assert model_key not in request_dispatcher._active_queues assert model_key not in request_dispatcher._queues + popen.wait() + # Try to remove the dispatcher and free the memory del request_dispatcher gc.collect() diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py new file mode 100644 index 000000000..e54cdf7fd --- /dev/null +++ b/tests/dragon/utils/msg_pump.py @@ -0,0 +1,194 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib +import time +import typing as t + +import pytest + +pytest.importorskip("torch") +pytest.importorskip("dragon") + + +# isort: off +import dragon +import multiprocessing as mp +import torch +import torch.nn as nn + +# isort: on + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__, log_level=logging.DEBUG) + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + +try: + mp.set_start_method("dragon") +except Exception: + pass + + +class MiniModel(nn.Module): + def __init__(self): + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + return self._net(input) + + @property + def bytes(self) -> bytes: + """Returns the model serialized to a byte stream""" + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + return torch.randn((100, 2), dtype=torch.float32) + + +def load_model() -> bytes: + """Create a simple torch model in memory for testing""" + mini_model = MiniModel() + return mini_model.bytes + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+
+    TODO: remove once unit tests are in place"""
+    if not model_path.parent.exists():
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+    model_path.unlink(missing_ok=True)
+
+    model = torch.nn.Linear(2, 1)
+    torch.save(model, model_path)
+
+    return model_path
+
+
+def mock_messages(
+    dispatch_fli_descriptor: str,
+    fs_descriptor: str,
+    parent_iteration: int,
+    callback_descriptor: str,
+) -> None:
+    """Mock event producer for triggering the inference pipeline.
+
+    :param dispatch_fli_descriptor: Descriptor of the request dispatcher FLI
+    :param fs_descriptor: Descriptor of the backbone feature store
+    :param parent_iteration: Iteration index supplied by the caller, used to
+    keep log output from repeated pump invocations distinguishable
+    :param callback_descriptor: Descriptor of the channel to receive results on
+    """
+    model_key = "mini-model"
+    # mock_messages sends 2 messages, so we offset by 2 * (# of iterations in caller)
+    offset = 2 * parent_iteration
+
+    feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor)
+    request_dispatcher_queue = DragonFLIChannel.from_sender_supplied_descriptor(
+        dispatch_fli_descriptor
+    )
+
+    for iteration_number in range(2):
+        logged_iteration = offset + iteration_number
+        logger.debug(f"Sending mock message {logged_iteration}")
+
+        output_key = f"output-{iteration_number}"
+
+        feature_store[model_key] = load_model()
+
+        tensor = (
+            (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32)
+        ).numpy()
+        fsd = feature_store.descriptor
+
+        tensor_desc = MessageHandler.build_tensor_descriptor(
+            "c", "float32", list(tensor.shape)
+        )
+
+        message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd)
+        message_model_key = MessageHandler.build_model_key(model_key, fsd)
+
+        request = MessageHandler.build_request(
+            reply_channel=callback_descriptor,
+            model=message_model_key,
+            inputs=[tensor_desc],
+            outputs=[message_tensor_output_key],
+            output_descriptors=[],
+            custom_attributes=None,
+        )
+
+        logger.info(f"Sending request {iteration_number} to request_dispatcher_queue")
+        request_bytes = MessageHandler.serialize_request(request)
+        with request_dispatcher_queue._fli.sendh(
+            timeout=None, stream_channel=request_dispatcher_queue._channel
+        ) as sendh:
+            sendh.send_bytes(request_bytes)
+            sendh.send_bytes(tensor.tobytes())
+
+        logger.info(
+            f"Retrieving {iteration_number} from callback channel: {callback_descriptor}"
+        )
+        callback_channel = DragonCommChannel.from_descriptor(callback_descriptor)
+
+        # Results will be empty. The test pulls messages off the queue before they
+        # can be serviced by a worker. Just ensure the callback channel works.
+        results = callback_channel.recv(timeout=0.1)
+        logger.debug(f"Received mock message results on callback channel: {results}")
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dispatch-fli-descriptor", type=str)
+    parser.add_argument("--fs-descriptor", type=str)
+    parser.add_argument("--parent-iteration", type=int)
+    parser.add_argument("--callback-descriptor", type=str)
+
+    args = parser.parse_args()
+
+    mock_messages(
+        args.dispatch_fli_descriptor,
+        args.fs_descriptor,
+        args.parent_iteration,
+        args.callback_descriptor,
+    )

From a87aba7a0a83e17e8ff679104e9170bcf0681bbe Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Mon, 23 Sep 2024 16:28:12 -0500
Subject: [PATCH 05/40] extract msg_pump_factory for reuse in
 test_worker_manager.py

---
 conftest.py                             | 74 ++++++++++++++++++++++++-
 tests/dragon/test_request_dispatcher.py | 46 ++++++++++-------
 2 files changed, 99 insertions(+), 21 deletions(-)

diff --git a/conftest.py b/conftest.py
index 991c0d17b..622dd7a7c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -227,7 +227,6 @@ def kill_all_test_spawned_processes() -> None:
             print("Not all processes were killed after test")
 
 
-
 def get_hostlist() -> t.Optional[t.List[str]]:
     global test_hostlist
     if not test_hostlist:
@@ -1022,3 +1021,76 @@ def _prepare_db(db_config: DBConfiguration) -> PrepareDatabaseOutput:
         return PrepareDatabaseOutput(db, new_db)
 
     return _prepare_db
+
+
+class MsgPumpRequest(t.NamedTuple):
+    """Fields required for starting a simulated inference request producer."""
+
+    backbone_descriptor: str
+    """The descriptor to use when connecting the message pump to a
+    backbone featurestore.
+
+    Passed to the message pump as `--fs-descriptor`
+    """
+    work_queue_descriptor: str
+    """The descriptor to use for sending work from the pump to the worker manager.
+
+    Passed to the message pump as `--dispatch-fli-descriptor`
+    """
+    callback_descriptor: str
+    """The descriptor the worker should use to return results.
+
+    Passed to the message pump as `--callback-descriptor`
+    """
+    iteration_index: int = 1
+    """If calling the message pump repeatedly, supply an iteration index to ensure
+    that logged messages appear unique instead of appearing to be duplicated logs.
+
+    Passed to the message pump as `--parent-iteration`
+    """
+
+    def as_command(self) -> t.List[str]:
+        """Produce the CLI arguments used to execute the msg pump
+        via subprocess.Popen.
+
+        NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]`
+        portion of the necessary parameters to Popen.
+
+        :returns: A list of strings containing the arguments of the request
+        formatted for inclusion in a call to subprocess.Popen"""
+        return [
+            "--dispatch-fli-descriptor",
+            self.work_queue_descriptor,
+            "--fs-descriptor",
+            self.backbone_descriptor,
+            "--parent-iteration",
+            str(self.iteration_index),
+            "--callback-descriptor",
+            self.callback_descriptor,
+        ]
+
+
+@pytest.fixture(scope="session")
+def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]:
+    """A pytest fixture used to create a mock event producer capable of
+    feeding asynchronous inference requests to tests requiring them.
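+
+    A sketch of typical use within a test (the descriptors shown are
+    hypothetical placeholders):
+
+        request = MsgPumpRequest("fs-desc", "fli-desc", "cb-desc")
+        msg_pump = msg_pump_factory(request)
+        msg_pump.wait()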
+ + :returns: A function that can be passed appropriate descriptors + for starting a message pump.""" + + def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: + """Invokes the message pump entry-point""" + # /tests/dragon/utils/msg_pump.py + msg_pump_script = "tests/dragon/utils/msg_pump.py" + msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script + + cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] + + popen = subprocess.Popen( + args=cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return popen + + return run_message_pump diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 352b2d538..e111f8c74 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -37,12 +37,15 @@ import numpy as np import pytest +import conftest + pytest.importorskip("dragon") # isort: off import dragon import multiprocessing as mp + import torch # isort: on @@ -73,16 +76,23 @@ from smartsim.log import get_logger logger = get_logger(__name__) +mock_msg_pump_path = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" +_MsgPumpFactory = t.Callable[[conftest.MsgPumpRequest], sp.Popen] + # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon + try: mp.set_start_method("dragon") except Exception: pass -def test_request_dispatcher() -> None: +@pytest.mark.parametrize("num_iterations", [4]) +def test_request_dispatcher( + msg_pump_factory: _MsgPumpFactory, num_iterations: int +) -> None: """Test the request dispatcher batching and queueing system This also includes setting a queue to disposable, checking that it is no @@ -123,8 +133,9 @@ def test_request_dispatcher() -> None: ) request_dispatcher._on_start() + pump_processes: t.List[sp.Popen] = [] - for i in range(2): + for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] @@ -134,27 +145,17 @@ def test_request_dispatcher() -> None: callback_channel = DragonCommChannel.from_local() - fp = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" - cmd = [ - sys.executable, - str(fp.absolute()), - "--dispatch-fli-descriptor", - worker_queue.descriptor, - "--fs-descriptor", + request = conftest.MsgPumpRequest( backbone_fs.descriptor, - "--parent-iteration", - str(i), - "--callback-descriptor", + worker_queue.descriptor, callback_channel.descriptor, - ] - - popen = sp.Popen( - args=cmd, - stdout=sp.PIPE, - stderr=sp.PIPE, + i, ) - time.sleep(2) + msg_pump = msg_pump_factory(request) + pump_processes.append(msg_pump) + + time.sleep(1) for _ in range(200): try: @@ -222,7 +223,12 @@ def test_request_dispatcher() -> None: assert model_key not in request_dispatcher._active_queues assert model_key not in request_dispatcher._queues - popen.wait() + msg_pump.wait() + + for msg_pump in pump_processes: + if msg_pump.returncode is not None: + continue + msg_pump.terminate() # Try to remove the dispatcher and free the memory del request_dispatcher From 48791226307e987092bc2f3b0219917772f2c33e Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 24 Sep 2024 14:33:57 -0500 Subject: [PATCH 06/40] add test verifying protoclient event raises --- .../_core/launcher/dragon/dragonBackend.py | 30 +- .../storage/backbone_feature_store.py | 57 ++- smartsim/protoclient.py | 109 ++-- tests/dragon/test_protoclient.py | 473 ++++++++++-------- 4 files changed, 377 insertions(+), 292 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py 
b/smartsim/_core/launcher/dragon/dragonBackend.py index 0f8121ab5..a1367af2a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -277,6 +277,7 @@ def status_message(self) -> str: ) def _heartbeat(self) -> None: + """Update the value of the last heartbeat to the current time.""" self._last_beat = self.current_time @property @@ -621,38 +622,15 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: name="BackendConsumerRegistrar", event_handler=self._on_consumer_created, ) + while consumer.register(): + # wait for the consumer to complete registration + ... # self._backbone.backend_channel = # consumer.descriptor # i want to get rid of this extra channel # self._bootstrap_event_listeners(backbone, consumer) self._event_consumer = consumer - # options = dragon_process_desc. - # ProcessOptions(make_inf_channels=True) # what is this!? - # grp_consumer = dragon_process_group.ProcessGroup( - # restart=False, pmi_enabled=False - # ) - # self._event_consumer_process = dragon_process.ProcessTemplate( - # target=self._event_consumer.listen, - # # args=request.exe_args, - # # cwd=request.path, - # env={ - # # **request.current_env, - # # **request.env, - # **self._backbone.get_env(), - # }, - # stdout=dragon_process.Popen.PIPE, - # stderr=dragon_process.Popen.PIPE, - # # policy=local_policy, - # options=options, - # ) - # grp_consumer.add(self._event_consumer_process) - # # self._event_consumer_process = - # mp.Process(target=self._event_consumer.listen) - # # self._event_consumer_process.start() - # grp_consumer.init() - # grp_consumer.start() - logger.info("Created event consumer") return self._event_consumer diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 9cc8a6bf9..e48f4e4e9 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -104,7 +104,7 @@ def notification_channels(self) -> t.Sequence[str]: :returns: The list of descriptors """ - if "_SMARTSIM_MLI_NOTIFY_CONSUMERS" in self: + if self.MLI_NOTIFY_CONSUMERS in self: stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] return str(stored_consumers).split(",") return [] @@ -367,6 +367,8 @@ def send(self, event: EventBase) -> int: raise Exception("No channel to send on") num_sent = 0 + logger.debug(f"Sending {event} to {self._channel.descriptor}") + try: event_bytes = bytes(event) self._channel.send(event_bytes) @@ -399,7 +401,7 @@ def __init__( ) """A mapping of instantiated channels that can be re-used. Automatically calls the channel factory if a descriptor is not already in the collection""" - self._event_buffer: t.Deque[bytes] = deque() + self._event_buffer: t.Deque[EventBase] = deque() """A buffer for storing events when a consumer list is not found""" self._descriptors: t.Set[str] """Stores the most recent list of broadcast consumers. Updated automatically @@ -416,15 +418,15 @@ def num_buffered(self) -> int: return len(self._event_buffer) def _save_to_buffer(self, event: EventBase) -> None: - """Places a serialized event in the buffer to be sent once a consumer + """Places the event in the buffer to be sent once a consumer list is available. 
:param event: The event to serialize and buffer
         :raises ValueError: If the event cannot be serialized
         """
         try:
-            event_bytes = bytes(event)
-            self._event_buffer.append(event_bytes)
+            self._event_buffer.append(event)
+            logger.debug(f"Buffered event {event=}")
         except Exception as ex:
             raise ValueError(f"Unable to serialize event from {self._uid}") from ex
 
@@ -459,7 +461,7 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase:
 
         :param descriptor: The descriptor to pass to the channel factory
         :returns: The instantiated channel
-        :raises SmartSimError: If the channel fails to build
+        :raises SmartSimError: If the channel fails to attach
         """
         comm_channel = self._channel_cache[descriptor]
         if comm_channel is not None:
@@ -477,11 +479,24 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase:
             logger.error(msg, exc_info=True)
             raise SmartSimError(msg) from ex
 
+    def _get_next_event(self) -> t.Optional[EventBase]:
+        """Pop the next event to be sent from the queue.
+
+        :returns: The next event to send if any events are enqueued, otherwise `None`.
+        """
+        try:
+            return self._event_buffer.popleft()
+        except IndexError:
+            logger.debug(f"Broadcast buffer exhausted for {self._uid}")
+
+        return None
+
     def _broadcast(self, timeout: float = 0.001) -> int:
         """Broadcasts all buffered events to registered event consumers.
 
         :param timeout: Maximum time to wait (in seconds) for messages to send
         :returns: The number of events broadcasted to consumers
+        :raises SmartSimError: If the channel fails to attach
         :raises SmartSimError: If broadcasting fails
         """
         # allow descriptors to be empty since events are buffered
@@ -493,31 +508,26 @@ def _broadcast(self, timeout: float = 0.001) -> int:
         self._prune_unused_consumers()
         self._log_broadcast_start()
 
-        num_sent: int = 0
-        next_event: t.Optional[bytes] = self._event_buffer.popleft()
+        num_sent = 0
+        num_listeners = len(self._descriptors)
 
         # send each event to every consumer
-        while next_event is not None:
-            for descriptor in map(str, self._descriptors):
+        while event := self._get_next_event():
+            logger.debug(f"Broadcasting {event=} to {num_listeners} listeners")
+            event_bytes = bytes(event)
+
+            for i, descriptor in enumerate(self._descriptors):
                 comm_channel = self._get_comm_channel(descriptor)
 
                 try:
-                    # todo: given a failure, the message is not sent to any other
-                    # recipients. consider retrying, adding a dead letter queue, or
-                    # logging the message details more intentionally
-                    comm_channel.send(next_event, timeout)
+                    comm_channel.send(event_bytes, timeout)
                     num_sent += 1
                 except Exception as ex:
                     raise SmartSimError(
-                        f"Failed broadcast to channel {descriptor} from {self._uid}"
+                        f"Broadcast {i}/{num_listeners} for event {event.uid} to "
+                        f"channel {descriptor} from {self._uid} failed."
                     ) from ex
 
-            try:
-                next_event = self._event_buffer.popleft()
-            except IndexError:
-                next_event = None
-                logger.debug(f"Broadcast buffer exhausted for {self._uid}")
-
         return num_sent
 
     def send(self, event: EventBase, timeout: float = 0.001) -> int:
@@ -629,9 +639,12 @@ def register(self) -> t.Generator[bool, None, None]:
         """Send an event to register this consumer as a listener"""
         awaiting_confirmation = True
         descriptor = self._comm_channel.descriptor
-        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0, 8.0))
+        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0))
 
         event = OnCreateConsumer(descriptor, self._global_filters)
 
+        # create a temporary publisher to broadcast my own existence.
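+        # the OnCreateConsumer event is what the backend listener reacts to;
+        # it responds by adding this descriptor to the notification channels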
+
+        publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local)
+
         # we're going to sit in this loop to wait for the backbone to get
         # updated with the registration (to avoid SEND/ACK)
         while awaiting_confirmation:
diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py
index b0e235f8c..c2b7ebaf0 100644
--- a/smartsim/protoclient.py
+++ b/smartsim/protoclient.py
@@ -43,7 +43,10 @@
 import numpy
 import torch
 
-from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_channel import (
+    create_local,
+    DragonCommChannel,
+)
 from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
@@ -56,8 +59,12 @@
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
 
-# from mpi4py import MPI
 
+try:
+    from mpi4py import MPI
+except Exception:
+    MPI = None
+    print("Unable to import `mpi4py` package")
 
 _TimingDict = OrderedDict[str, list[str]]
 
@@ -68,7 +75,16 @@
 
 
 class ProtoClient:
-    _DEFAULT_TIMEOUT = 30.0
+    """Proof of concept implementation of a client enabling user applications
+    to interact with MLI resources."""
+
+    _DEFAULT_BACKBONE_TIMEOUT = 30.0
+    """A default timeout period applied to connection attempts with the
+    backbone feature store."""
+
+    _DEFAULT_WORK_QUEUE_SIZE = 500
+    """A default number of events to be buffered in the work queue before
+    triggering QueueFull exceptions."""
 
     @staticmethod
     def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore:
@@ -82,7 +98,8 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore:
         descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
         if descriptor is None:
             raise SmartSimError(
-                "Missing required backbone configuration in environment"
+                "Missing required backbone configuration in environment: "
+                f"{BackboneFeatureStore.MLI_BACKBONE}"
             )
 
         backbone = t.cast(
@@ -94,37 +111,43 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore:
     def _attach_to_worker_queue(self) -> DragonFLIChannel:
         """Wait until the backbone contains the worker queue configuration,
         then attach an FLI to the given worker queue"""
-        configuration = self._backbone.wait_for(
-            [BackboneFeatureStore.MLI_WORKER_QUEUE], self._timeout
-        )
 
-        # descriptor = configuration.get(BackboneFeatureStore.MLI_WORKER_QUEUE, None)
-        # NOTE: without wait_for, this MUST be in the backbone....
-        # descriptor = self._backbone.worker_queue
-        descriptor = str(configuration[BackboneFeatureStore.MLI_WORKER_QUEUE])
-        if not descriptor:
-            raise ValueError("Unable to locate worker queue using backbone")
+        descriptor = ""
+        try:
+            # NOTE: without wait_for, this MUST be in the backbone....
+            config = self._backbone.wait_for(
+                [BackboneFeatureStore.MLI_WORKER_QUEUE], self.backbone_timeout
+            )
+            descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE])
+        except Exception as ex:
+            logger.info(
+                f"Unable to retrieve {BackboneFeatureStore.MLI_WORKER_QUEUE} "
+                "to attach to the worker queue."
+ ) + raise ValueError("Unable to locate worker queue using backbone") from ex - # self._to_worker_fli = DragonFLIChannel.from_descriptor(descriptor) - return DragonFLIChannel.from_descriptor(str(descriptor)) + return DragonFLIChannel.from_descriptor(descriptor) - @staticmethod - def _create_worker_channels() -> t.Tuple[DragonCommChannel, DragonCommChannel]: - """Create channels to be used in the worker queue""" - # self._from_worker_ch = Channel.make_process_local() - _from_worker_ch = DragonCommChannel.from_local() - # self._from_worker_ch_serialized = self._from_worker_ch.serialize() - # self._to_worker_ch = Channel.make_process_local() - _to_worker_ch = DragonCommChannel.from_local() + @classmethod + def _create_worker_channels( + cls, + ) -> t.Tuple[dragon.channels.Channel, dragon.channels.Channel]: + """Create channels to be used for communication to and from the worker queue. - return _from_worker_ch, _to_worker_ch + :returns: A tuple containing the native from and to Channels as (from_channel, to_channel). + """ + + _from_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) + _to_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) + + return _from_worker_ch_raw, _to_worker_ch_raw def _create_broadcaster(self) -> EventProducer: """Create an event publisher that will broadcast updates to other MLI components. This publisher :returns: the event publisher instance""" - broadcaster: EventProducer = EventBroadcaster( + broadcaster = EventBroadcaster( self._backbone, DragonCommChannel.from_descriptor ) return broadcaster @@ -138,30 +161,21 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: worker queue :raises: SmartSimError if unable to attach to a backbone featurestore""" - # comm = MPI.COMM_WORLD - # rank = comm.Get_rank() - rank: int = 0 - self._timeout = wait_timeout or self._DEFAULT_TIMEOUT + # todo: determine a way to make this work in tests. + # - consider catching the import exception and defaulting rank to 0 + if MPI is not None: + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + else: + rank: int = 0 + + self._backbone_timeout = wait_timeout connect_to_infrastructure() - # ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - # self._ddict = DDict.attach(ddict_str) - # self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - self._backbone = self._attach_to_backbone(wait_timeout=wait_timeout) - - # # to_worker_fli_str = None - # # while to_worker_fli_str is None: - # # try: - # # to_worker_fli_str = self._ddict["to_worker_fli"] - # # self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - # # except KeyError: - # # time.sleep(1) + self._backbone = self._attach_to_backbone(wait_timeout=self.backbone_timeout) self._to_worker_fli = self._attach_to_worker_queue() - # # # self._from_worker_ch = Channel.make_process_local() - # # # self._from_worker_ch_serialized = self._from_worker_ch.serialize() - # # # self._to_worker_ch = Channel.make_process_local() channels = self._create_worker_channels() self._from_worker_ch = channels[0] self._to_worker_ch = channels[1] @@ -176,6 +190,13 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: self._timings: _TimingDict = OrderedDict() self._timing_on = timing_on + @property + def backbone_timeout(self) -> float: + """The timeout (in seconds) applied to retrievals from the backbone feature store. 
+ + :returns: A float indicating the number of seconds to allow""" + return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT + def _add_label_to_timings(self, label: str) -> None: if label not in self._timings: self._timings[label] = [] diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 3eb800bb7..01b280d08 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -1,231 +1,304 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# import pickle -# import time -# import typing as t - -# import pytest - -# dragon = pytest.importorskip("dragon") - -# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# BackboneFeatureStore, -# EventBroadcaster, -# OnWriteFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict -# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -# from smartsim.error.errors import SmartSimError -# from smartsim.log import get_logger - -# # isort: off -# from dragon import fli -# from dragon.channels import Channel - -# # from ..ex..high_throughput_inference.mock_app import ProtoClient -# from smartsim.protoclient import ProtoClient - - -# # The tests in this file belong to the dragon group -# pytestmark = pytest.mark.dragon -# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -# logger = get_logger(__name__) - - -# @pytest.fixture -# def storage_for_dragon_fs() -> t.Dict[str, str]: -# # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) -# return dragon_ddict.DDict(1, 2, 4 * 1024**2) - - -# @pytest.fixture -# def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: -# return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) - - -# @pytest.fixture -# def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: -# """a stand-in for the worker manager so a worker queue exists""" - -# # create the FLI -# to_worker_channel = Channel.make_process_local() -# # to_worker_channel = create_local() -# fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) -# comm_channel = DragonFLIChannel(fli_, True) - -# # store the descriptor in the backbone -# # the_backbone.worker_queue = comm_channel.descriptor -# the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor - -# try: -# comm_channel.send(b"foo") -# except Exception as ex: -# print(f"ohnooooo: {ex}") - -# return comm_channel - - -# @pytest.fixture -# def storage_for_dragon_fs_with_req_queue( -# storage_for_dragon_fs: t.Dict[str, str] -# ) -> t.Dict[str, str]: -# # create a valid FLI so any call to attach does not fail -# channel_ = Channel.make_process_local() -# fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) -# comm_channel = DragonFLIChannel(fli_, True) - -# storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor -# return storage_for_dragon_fs - - -# @pytest.mark.parametrize( -# "wait_timeout, exp_wait_max", -# [ -# # aggregate the 1+1+1 into 3 on remaining parameters -# pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), -# pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), -# pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), -# ], -# ) -# def test_protoclient_timeout( -# wait_timeout: float, -# exp_wait_max: float, -# the_backbone: BackboneFeatureStore, -# monkeypatch: pytest.MonkeyPatch, -# ): -# """Verify that attempts to attach to the worker queue from the protoclient -# timeout in an appropriate amount of time. Note: due to the backoff, we verify -# the elapsed time is less than the 15s of a cycle of waits +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pickle +import time +import typing as t +from unittest.mock import MagicMock + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBroadcaster, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# isort: off +from dragon import fli +from dragon.channels import Channel + +# from ..ex..high_throughput_inference.mock_app import ProtoClient +from smartsim.protoclient import ProtoClient + + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" +logger = get_logger(__name__) + + +@pytest.fixture +def storage_for_dragon_fs() -> t.Dict[str, str]: + # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) + return dragon_ddict.DDict(1, 2, 4 * 1024**2) + + +@pytest.fixture +def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: + return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + + +@pytest.fixture +def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: + """a stand-in for the worker manager so a worker queue exists""" + + # create the FLI + to_worker_channel = create_local() + fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + + # store the descriptor in the backbone + the_backbone.worker_queue = comm_channel.descriptor + # the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor + + try: + comm_channel.send(b"foo") + except Exception as ex: + print(f"ohnooooo: {ex}") + + return comm_channel + + +@pytest.fixture +def storage_for_dragon_fs_with_req_queue( + storage_for_dragon_fs: t.Dict[str, str] +) -> t.Dict[str, str]: + # create a valid FLI so any call to attach does not fail + channel_ = Channel.make_process_local() + fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + + 
storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor + return storage_for_dragon_fs + + +@pytest.mark.parametrize( + "wait_timeout, exp_wait_max", + [ + # aggregate the 1+1+1 into 3 on remaining parameters + pytest.param( + 0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps", marks=pytest.mark.skip + ), + pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps", marks=pytest.mark.skip), + pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps", marks=pytest.mark.skip), + ], +) +def test_protoclient_timeout( + wait_timeout: float, + exp_wait_max: float, + the_backbone: BackboneFeatureStore, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that attempts to attach to the worker queue from the protoclient + timeout in an appropriate amount of time. Note: due to the backoff, we verify + the elapsed time is less than the 15s of a cycle of waits -# :param wait_timeout: a timeout for use when configuring a proto client -# :param exp_wait_max: a ceiling for the expected time spent waiting for -# the timeout -# :param the_backbone: a pre-initialized backbone featurestore for setting up -# the environment variable required by the client""" + :param wait_timeout: a timeout for use when configuring a proto client + :param exp_wait_max: a ceiling for the expected time spent waiting for + the timeout + :param the_backbone: a pre-initialized backbone featurestore for setting up + the environment variable required by the client""" -# # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] -# # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) -# start_time = time.time() -# with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: -# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) + # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] + # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) + start_time = time.time() + with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) -# ProtoClient(False, wait_timeout=wait_timeout) + ProtoClient(False, wait_timeout=wait_timeout) -# end_time = time.time() -# elapsed = end_time - start_time + end_time = time.time() + elapsed = end_time - start_time -# # todo: revisit. should this trigger any wait if the backbone is set above? -# # confirm that we met our timeout -# # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" + # todo: revisit. should this trigger any wait if the backbone is set above? + # confirm that we met our timeout + # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" -# # confirm that the total wait time is aligned with the sleep cycle -# assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" + # confirm that the total wait time is aligned with the sleep cycle + assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" -# def test_protoclient_initialization_no_backbone(): -# """Verify that attempting to start the client without required environment variables -# results in an exception. NOTE: Backbone env var is not set""" +def test_protoclient_initialization_no_backbone(): + """Verify that attempting to start the client without required environment variables + results in an exception. 
NOTE: Backbone env var is not set""" -# with pytest.raises(SmartSimError) as ex: -# ProtoClient(timing_on=False) + with pytest.raises(SmartSimError) as ex: + ProtoClient(timing_on=False) -# # confirm the missing value error has been raised -# assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) + # confirm the missing value error has been raised + assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) -# def test_protoclient_initialization( -# the_backbone: BackboneFeatureStore, -# the_worker_queue: DragonFLIChannel, -# monkeypatch: pytest.MonkeyPatch, -# ): -# """Verify that attempting to start the client with required env vars results -# in a fully initialized client +def test_protoclient_initialization( + the_backbone: BackboneFeatureStore, + the_worker_queue: DragonFLIChannel, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that attempting to start the client with required env vars results + in a fully initialized client -# :param the_backbone: a pre-initialized backbone featurestore -# :param the_worker_queue: an FLI channel the client will retrieve -# from the backbone""" + :param the_backbone: a pre-initialized backbone featurestore + :param the_worker_queue: an FLI channel the client will retrieve + from the backbone""" -# with monkeypatch.context() as ctx: -# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) -# # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture + with monkeypatch.context() as ctx: + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) + # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone -# client = ProtoClient(timing_on=False) + client = ProtoClient(timing_on=False) -# # confirm the backbone was attached correctly -# assert client._backbone is not None -# assert client._backbone.descriptor == the_backbone.descriptor + fs_descriptor = the_backbone.descriptor + wq_descriptor = the_worker_queue.descriptor -# # confirm the worker queue is created and attached correctly -# assert client._to_worker_fli is not None -# assert client._to_worker_fli.descriptor == the_worker_queue.descriptor + # confirm the backbone was attached correctly + assert client._backbone is not None + assert client._backbone.descriptor == fs_descriptor -# # confirm the worker channels are created -# assert client._from_worker_ch is not None -# assert client._from_worker_ch.descriptor + # we expect the backbone to add its descriptor to the local env + assert os.environ[BackboneFeatureStore.MLI_BACKBONE] == fs_descriptor -# assert client._to_worker_ch is not None -# assert client._to_worker_ch.descriptor + # confirm the worker queue is created and attached correctly + assert client._to_worker_fli is not None + assert client._to_worker_fli.descriptor == wq_descriptor + + # we expect the worker queue descriptor to be placed into the backbone + # we do NOT expect _from_worker_ch to be placed anywhere. 
it's a specific callback
+        assert the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] == wq_descriptor
 
-#     assert client._to_worker_ch is not None
-#     assert client._to_worker_ch.descriptor
+        # confirm the worker channels are created
+        assert client._from_worker_ch is not None
+        assert client._to_worker_ch is not None
+
+        # wrap the channels just to easily verify they produce a descriptor
+        assert DragonCommChannel(client._from_worker_ch).descriptor
+        assert DragonCommChannel(client._to_worker_ch).descriptor
+
+        # confirm a publisher is created
+        assert client._publisher is not None
+
+
+def test_protoclient_write_model(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that writing a model using the client causes the model data to be
+    written to a feature store.
 
-# def test_protoclient_write_model(
-#     the_backbone: BackboneFeatureStore,
-#     the_worker_queue: DragonFLIChannel,
-#     monkeypatch: pytest.MonkeyPatch,
-# ):
-#     """Verify that writing a model using the client causes the model data to be
-#     written to a feature store and triggers a key-written event
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone"""
 
-#     :param the_backbone: a pre-initialized backbone featurestore
-#     :param the_worker_queue: an FLI channel the client will retrieve
-#     from the backbone"""
+    with monkeypatch.context() as ctx:
+        ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+        # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
 
-#     with monkeypatch.context() as ctx:
-#         ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
-#         # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture
+        client = ProtoClient(timing_on=False)
 
-#     client = ProtoClient(timing_on=False)
+        model_key = "my-model"
+        model_bytes = b"12345"
 
-#     model_key = "my-model"
-#     model_bytes = b"12345"
+        client.set_model(model_key, model_bytes)
 
-#     client.set_model(model_key, model_bytes)
+        # confirm the client modified the underlying feature store
+        assert client._backbone[model_key] == model_bytes
 
-#     # confirm the client modified the underlying feature store
-#     assert client._backbone[model_key] == model_bytes
 
-#     publisher = t.cast(EventBroadcaster, client._publisher)
+@pytest.mark.parametrize("num_listeners", [1, 2, 4])
+def test_protoclient_write_model_notification_sent(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+    num_listeners: int,
+):
+    """Verify that writing a model sends a key-written event
 
-#     # confirm the client raised the key-written event
-#     assert len(publisher._event_buffer) == 1
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone
+    :param num_listeners: vary the number of registered listeners
+    to verify that the event is broadcast to everyone
+    """
 
-#     event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop()))
-#     assert event.descriptor == the_backbone.descriptor
-#     assert event.key == model_key
+    # we won't actually send here, but it won't try without registered listeners
+    listeners = [f"mock-ch-desc-{i}" for i in range(num_listeners)]
+    the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners)
+
+    with monkeypatch.context() as ctx:
+        ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+        # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
+
+        client = ProtoClient(timing_on=False)
+
+        publisher = t.cast(EventBroadcaster, client._publisher)
+
+        # mock attaching to a channel given the mock-ch-desc in backbone
+        mock_send = MagicMock(return_value=None)
+        mock_comm_channel = MagicMock(**{"send": mock_send}, spec=DragonCommChannel)
+        mock_get_comm_channel = MagicMock(return_value=mock_comm_channel)
+        ctx.setattr(publisher, "_get_comm_channel", mock_get_comm_channel)
+
+        model_key = "my-model"
+        model_bytes = b"12345"
+
+        client.set_model(model_key, model_bytes)
+
+        # confirm that a listener channel was attached
+        # once for each registered listener in backbone
+        assert mock_get_comm_channel.call_count == num_listeners
+
+        # confirm the client raised the key-written event
+        assert (
+            mock_send.call_count == num_listeners
+        ), f"Expected {num_listeners} sends with {num_listeners} registrations"
+
+        # with at least 1 consumer registered, we can verify the message is sent
+        for call_args in mock_send.call_args_list:
+            send_args = call_args.args
+            event_bytes, timeout = send_args[0], send_args[1]
+
+            assert event_bytes, "Expected event bytes to be supplied to send"
+            assert (
+                timeout == 0.001
+            ), "Expected default timeout on call to `publisher.send`"
+
+            # confirm the correct event was raised
+            event = t.cast(OnWriteFeatureStore, pickle.loads(event_bytes))
+            assert event.descriptor == the_backbone.descriptor
+            assert event.key == model_key

From 6f1cba79e18b8339fa3ddcd657a6187bba6128d7 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 24 Sep 2024 17:37:52 -0500
Subject: [PATCH 07/40] reduce timeouts & backoffs, share backbone across
 protoclient tests

---
 .../_core/launcher/dragon/dragonBackend.py    |  5 ++--
 smartsim/_core/mli/comm/channel/channel.py    |  2 +-
 .../_core/mli/comm/channel/dragon_channel.py  |  2 +-
 smartsim/_core/mli/comm/channel/dragon_fli.py | 17 +++++------
 .../storage/backbone_feature_store.py         | 26 +++++++++++------
 smartsim/protoclient.py                       | 25 +++++++++--------
 tests/dragon/channel.py                       |  7 +++--
 tests/dragon/test_featurestore_base.py        |  2 +-
 tests/dragon/test_protoclient.py              | 28 ++++++++++++-------
 tests/dragon/utils/channel.py                 |  9 +++---
 tests/mli/channel.py                          |  3 +-
 11 files changed, 74 insertions(+), 52 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index a1367af2a..577b95119 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -622,9 +622,8 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer:
             name="BackendConsumerRegistrar",
             event_handler=self._on_consumer_created,
         )
-        while consumer.register():
-            # wait for the consumer to complete registration
-            ...
+        consumer.register()
+        logger.info(f"Consumer `{consumer.name}` registration completed.")
 
         # self._backbone.backend_channel =
         # consumer.descriptor # i want to get rid of this extra channel
diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
index 90d81cb9b..bfa1c50fb 100644
--- a/smartsim/_core/mli/comm/channel/channel.py
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -55,8 +55,8 @@ def __init__(
     def send(self, value: bytes, timeout: float = 0) -> None:
         """Send a message through the underlying communication channel.
- :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send :raises SmartSimError: If sending message fails """ diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 4f8d3e552..0b73080d6 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -151,7 +151,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( - f"Error sending message: DragonCommChannel {self.descriptor!r}" + f"Error sending via DragonCommChannel {self.descriptor}" ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 325f6b779..22593f63c 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -68,20 +68,23 @@ def __init__( create_local(buffer_size) if sender_supplied else None ) - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send( + self, value: bytes, timeout: float = 0.001, blocking: bool = False + ) -> None: """Send a message through the underlying communication channel. - :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :param blocking: Block returning until the message has been received :raises SmartSimError: If sending message fails """ try: with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value, timeout=timeout) - logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") + logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( - f"Error sending message: DragonFLIChannel {self.descriptor!r}" + f"Error sending message: DragonFLIChannel {self.descriptor}" ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: @@ -98,14 +101,12 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: try: message, _ = recvh.recv_bytes(timeout=timeout) messages.append(message) - logger.debug( - f"DragonFLIChannel {self.descriptor!r} received message" - ) + logger.debug(f"DragonFLIChannel {self.descriptor} received message") except fli.FLIEOT: eot = True except Exception as e: raise SmartSimError( - f"Error receiving messages: DragonFLIChannel {self.descriptor!r}" + f"Error receiving messages: DragonFLIChannel {self.descriptor}" ) from e return messages diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index e48f4e4e9..1110dc812 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -71,7 +71,7 @@ class BackboneFeatureStore(DragonFeatureStore): MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE" _CREATED_ON = "creation" - _DEFAULT_WAIT_TIMEOUT = 30.0 + _DEFAULT_WAIT_TIMEOUT = 1.0 def __init__( self, @@ -219,7 +219,7 @@ def wait_for( values: t.Dict[str, t.Union[str, bytes, None]] = {k: None for k in set(keys)} is_found = {k: False for k in values.keys()} - backoff: t.List[float] = [0.1, 0.5, 
1, 2, 4]
+        backoff = (0.1, 0.2, 0.4, 0.8)
         backoff_iter = itertools.cycle(backoff)
 
         start_time = time.time()
@@ -360,7 +360,7 @@ def __init__(
         self._backbone = backbone
         self._channel: t.Optional[CommChannelBase] = channel
 
-    def send(self, event: EventBase) -> int:
+    def send(self, event: EventBase, timeout: float = 0.001) -> int:
         """The send operation"""
         if self._channel is None:
             # self._channel = self._channel_factory(event)
@@ -371,7 +371,7 @@ def send(self, event: EventBase) -> int:
 
         try:
             event_bytes = bytes(event)
-            self._channel.send(event_bytes)
+            self._channel.send(event_bytes, timeout)
             num_sent += 1
         except Exception as ex:
             raise SmartSimError(f"Failed broadcast to channel: {self._channel}") from ex
@@ -589,6 +589,17 @@ def descriptor(self) -> str:
         :returns: The comm channel descriptor"""
         return self._comm_channel.descriptor
 
+    @property
+    def name(self) -> str:
+        """The friendly name assigned to the consumer.
+
+        :returns: The consumer name if one is assigned, otherwise a unique
+        id assigned by the system.
+        """
+        if self._name is None:
+            self._name = str(uuid.uuid4())
+        return self._name
+
     def receive(
         self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0
     ) -> t.List[EventBase]:
@@ -635,11 +646,11 @@ def receive(
 
         return messages
 
-    def register(self) -> t.Generator[bool, None, None]:
+    def register(self) -> None:
         """Send an event to register this consumer as a listener"""
         awaiting_confirmation = True
         descriptor = self._comm_channel.descriptor
-        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0))
+        backoffs = itertools.cycle((0.1, 0.2, 0.4, 0.8))
 
         event = OnCreateConsumer(descriptor, self._global_filters)
 
         # create a temporary publisher to broadcast my own existence.
@@ -654,7 +665,6 @@ def register(self) -> None:
             if descriptor in registered_channels:
                 awaiting_confirmation = False
 
-            yield not awaiting_confirmation
             time.sleep(next(backoffs))
 
         # if backend_descriptor := self._backbone.backend_channel:
@@ -665,7 +675,7 @@ def register(self) -> None:
 
         # broadcast that this consumer is now ready to mingle
         publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local)
-        publisher.send(event, timeout=0.1)
+        publisher.send(event, timeout=0.01)
 
 # def register_callback(self, callback: t.Callable[[EventBase], None]) -> None: ...
diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py
index c2b7ebaf0..3e786cf05 100644
--- a/smartsim/protoclient.py
+++ b/smartsim/protoclient.py
@@ -31,6 +31,12 @@
 import dragon.channels
 from dragon.globalservices.api_setup import connect_to_infrastructure
 
+try:
+    from mpi4py import MPI  # type: ignore[import-not-found]
+except Exception:
+    MPI = None
+    print("Unable to import `mpi4py` package")
+
 # isort: on
 # pylint: enable=unused-import,import-error
 
@@ -44,8 +50,8 @@
 import torch
 
 from smartsim._core.mli.comm.channel.dragon_channel import (
-    create_local,
     DragonCommChannel,
+    create_local,
 )
 from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
@@ -56,8 +59,12 @@
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
 
-
-try:
-    from mpi4py import MPI
-except Exception:
-    MPI = None
-    print("Unable to import `mpi4py` package")
-
 _TimingDict = OrderedDict[str, list[str]]
@@ -134,7 +133,8 @@ def _create_worker_channels(
     ) -> t.Tuple[dragon.channels.Channel, dragon.channels.Channel]:
         """Create channels to be used for communication to and from the worker queue.
- :returns: A tuple containing the native from and to Channels as (from_channel, to_channel). + :returns: A tuple containing the native from and to + Channels as (from_channel, to_channel). """ _from_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) @@ -165,9 +165,9 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: # - consider catching the import exception and defaulting rank to 0 if MPI is not None: comm = MPI.COMM_WORLD - rank = comm.Get_rank() + rank: int = comm.Get_rank() else: - rank: int = 0 + rank = 0 self._backbone_timeout = wait_timeout @@ -192,7 +192,8 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: @property def backbone_timeout(self) -> float: - """The timeout (in seconds) applied to retrievals from the backbone feature store. + """The timeout (in seconds) applied to retrievals + from the backbone feature store. :returns: A float indicating the number of seconds to allow""" return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py index b00ba9aa2..efabb00c0 100644 --- a/tests/dragon/channel.py +++ b/tests/dragon/channel.py @@ -54,10 +54,11 @@ def __init__(self, key: pathlib.Path) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 94733afc7..87536c5ba 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -759,7 +759,7 @@ def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None :param storage_for_dragon_fs: the dragon storage engine to use """ - # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] + # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8] # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) start_time = time.time() diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 01b280d08..f5a55a381 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -60,18 +60,18 @@ logger = get_logger(__name__) -@pytest.fixture +@pytest.fixture(scope="session") def storage_for_dragon_fs() -> t.Dict[str, str]: # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) return dragon_ddict.DDict(1, 2, 4 * 1024**2) -@pytest.fixture +@pytest.fixture(scope="session") def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) -@pytest.fixture +@pytest.fixture(scope="session") def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: """a stand-in for the worker manager so a worker queue exists""" @@ -82,12 +82,11 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: # store the descriptor in the backbone the_backbone.worker_queue = comm_channel.descriptor - # the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor try: comm_channel.send(b"foo") 
except Exception as ex:
-        print(f"ohnooooo: {ex}")
+        logger.exception("Test send from worker channel failed")
 
     return comm_channel
 
@@ -132,7 +131,7 @@ def test_protoclient_timeout(
     :param the_backbone: a pre-initialized backbone featurestore for setting up
     the environment variable required by the client"""
 
-    # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8]
+    # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8]
     # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
     start_time = time.time()
     with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex:
@@ -240,12 +239,16 @@ def test_protoclient_write_model(
         assert client._backbone[model_key] == model_bytes
 
 
-@pytest.mark.parametrize("num_listeners", [1, 2, 4])
+@pytest.mark.parametrize(
+    "num_listeners, num_model_updates",
+    [(1, 1), (1, 4), (2, 4), (16, 4), (64, 8)],
+)
 def test_protoclient_write_model_notification_sent(
     the_backbone: BackboneFeatureStore,
     the_worker_queue: DragonFLIChannel,
     monkeypatch: pytest.MonkeyPatch,
     num_listeners: int,
+    num_model_updates: int,
 ):
     """Verify that writing a model sends a key-written event
@@ -258,7 +261,11 @@ def test_protoclient_write_model_notification_sent(
 
     # we won't actually send here, but it won't try without registered listeners
     listeners = [f"mock-ch-desc-{i}" for i in range(num_listeners)]
+
+    the_backbone[BackboneFeatureStore.MLI_BACKBONE] = the_backbone.descriptor
+    the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_queue.descriptor
     the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners)
+    the_backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = None
 
     with monkeypatch.context() as ctx:
         ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
@@ -277,15 +284,16 @@ def test_protoclient_write_model_notification_sent(
         model_key = "my-model"
         model_bytes = b"12345"
 
-        client.set_model(model_key, model_bytes)
+        for i in range(num_model_updates):
+            client.set_model(model_key, model_bytes)
 
         # confirm that a listener channel was attached
         # once for each registered listener in backbone
-        assert mock_get_comm_channel.call_count == num_listeners
+        assert mock_get_comm_channel.call_count == num_listeners * num_model_updates
 
         # confirm the client raised the key-written event
         assert (
-            mock_send.call_count == num_listeners
+            mock_send.call_count == num_listeners * num_model_updates
         ), f"Expected {num_listeners} sends with {num_listeners} registrations"
 
        # with at least 1 consumer registered, we can verify the message is sent
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index b00ba9aa2..003d79400 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -40,7 +40,7 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
    def __init__(self, key: pathlib.Path) -> None:
-        """Initialize the FileSystemCommChannel instance
+        """Initialize the FileSystemCommChannel instance.
 
         :param key: a path to the root directory of the feature store"""
         self._lock = threading.RLock()
@@ -54,10 +54,11 @@ def __init__(self, key: pathlib.Path) -> None:
         self._file_path.touch()
 
     def send(self, value: bytes, timeout: float = 0) -> None:
-        """Send a message throuh the underlying communication channel
+        """Send a message through the underlying communication channel.
- :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: diff --git a/tests/mli/channel.py b/tests/mli/channel.py index b00ba9aa2..1bbf159b1 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -56,8 +56,9 @@ def __init__(self, key: pathlib.Path) -> None: def send(self, value: bytes, timeout: float = 0) -> None: """Send a message throuh the underlying communication channel + :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: From c0a1bca6108de64d4b5f3c19714a61decfb8bdf7 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 24 Sep 2024 21:38:03 -0500 Subject: [PATCH 08/40] Test eventing end-to-end in single process --- .../_core/launcher/dragon/dragonBackend.py | 47 +-- smartsim/_core/mli/comm/channel/channel.py | 4 +- .../_core/mli/comm/channel/dragon_channel.py | 2 +- .../storage/backbone_feature_store.py | 75 ++-- tests/dragon/test_dragon_backend.py | 322 ++++++++++-------- tests/dragon/test_featurestore.py | 6 +- tests/dragon/test_featurestore_base.py | 12 +- tests/dragon/test_featurestore_integration.py | 8 +- 8 files changed, 259 insertions(+), 217 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 577b95119..3fe120a9d 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -27,6 +27,7 @@ import functools import itertools import multiprocessing as mp +import os import time import typing as t from dataclasses import dataclass, field @@ -48,13 +49,17 @@ import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_channel import ( + DragonCommChannel, + create_local, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBase, # EventBroadcaster, EventCategory, EventConsumer, + OnCreateConsumer, ) # pylint: enable=import-error @@ -572,12 +577,21 @@ def _create_backbone(self) -> BackboneFeatureStore: self._backbone = BackboneFeatureStore( backbone_storage, allow_reserved_writes=True ) + + # put the backbone descriptor in the env vars + os.environ.update(self._backbone.get_env()) logger.info(self._backbone.creation_date) return self._backbone def _on_consumer_created(self, event: EventBase) -> None: """Event handler for""" + if isinstance(event, OnCreateConsumer) and self._backbone is not None: + notify_list = set(self._backbone.notification_channels) + notify_list.add(event.descriptor) + self._backbone.notification_channels = list(notify_list) + return + logger.warning(f"Unhandled event received: {event}") def _bootstrap_event_listeners( @@ -591,7 +605,7 @@ def _bootstrap_event_listeners( # Update directly to avoid SEND/ACK pattern notify_descriptors.append(consumer.descriptor) - # consumer.register() # this will loop infinitely waiting for itself + notify_descriptors = list(set(notify_descriptors)) 
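+        # de-duplicate the merged list so a re-bootstrapped backend cannot
+        # register the same consumer descriptor twice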
backbone.notification_channels = notify_descriptors @@ -605,16 +619,11 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: attempting to connect any eventing clients. :returns: The newly created EventConsumer instance """ - # if self._event_producer is None: - # logger.info("Creating event publisher") - # # todo: ensure DCC.from_descriptor and not DCC.from_local - # self._event_producer = - # EventBroadcaster(backbone, DragonCommChannel.from_descriptor) - # logger.info("Created event publisher") if self._event_consumer is None: logger.info("Creating event consumer") - event_channel = DragonCommChannel.from_local() + dragon_channel = create_local(500) + event_channel = DragonCommChannel(dragon_channel) consumer = EventConsumer( event_channel, backbone, @@ -622,24 +631,20 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: name="BackendConsumerRegistrar", event_handler=self._on_consumer_created, ) - consumer.register() - logger.info(f"Consumer `{consumer.name}` registration completed.") - # self._backbone.backend_channel = - # consumer.descriptor # i want to get rid of this extra channel - # self._bootstrap_event_listeners(backbone, consumer) self._event_consumer = consumer - - logger.info("Created event consumer") + backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = consumer.descriptor + logger.info(f"Backend consumer `{consumer.name}` created.") return self._event_consumer + def listen_to_registrations(self, timeout: float = 0.001) -> None: + if self._event_consumer is not None: + self._event_consumer.listen_once(timeout) + def _start_eventing_listeners(self) -> None: - if self._event_consumer: - self._event_consumer_process = mp.Process( - target=self._event_consumer.listen - ) - self._event_consumer_process.start() + # todo: start external listener entrypoint + ... @staticmethod def create_run_policy( diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index bfa1c50fb..a581e8e2a 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -52,7 +52,7 @@ def __init__( """A user-friendly identifier for channel-related logging""" @abstractmethod - def send(self, value: bytes, timeout: float = 0) -> None: + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. :param value: The value to send @@ -61,7 +61,7 @@ def send(self, value: bytes, timeout: float = 0) -> None: """ @abstractmethod - def recv(self, timeout: float = 0) -> t.List[bytes]: + def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
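
        Implementations are expected to return an empty list, rather than
        raise, when no message arrives before the timeout expires.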
:param timeout: Maximum time to wait (in seconds) for messages to arrive diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 0b73080d6..9c0ac3423 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -147,7 +147,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: """ try: with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value) + sendh.send_bytes(value, blocking=False) logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 1110dc812..83c255fe7 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -122,8 +122,8 @@ def backend_channel(self) -> t.Optional[str]: """Retrieve the channel descriptor exposed by the MLI backend for events :returns: a stringified channel descriptor""" - if self.MLI_NOTIFY_CONSUMERS in self: - return str(self[self.MLI_NOTIFY_CONSUMERS]) + if self.MLI_BACKEND_CONSUMER in self: + return str(self[self.MLI_BACKEND_CONSUMER]) return None @backend_channel.setter @@ -131,7 +131,7 @@ def backend_channel(self, value: str) -> None: """Set the channel exposed by the MLI backend for events :param value: a stringified channel descriptor""" - self[self.MLI_NOTIFY_CONSUMERS] = value + self[self.MLI_BACKEND_CONSUMER] = value @property def worker_queue(self) -> t.Optional[str]: @@ -165,8 +165,7 @@ def _record_creation_data(self) -> None: ) self[self._CREATED_ON] = str(time.time()) - if os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) is None: - os.environ.update(self.get_env()) + os.environ[self.MLI_BACKBONE] = self.descriptor @classmethod def from_writable_descriptor( @@ -479,7 +478,7 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase: logger.error(msg, exc_info=True) raise SmartSimError(msg) from ex - def _get_next_event_event(self) -> t.Optional[EventBase]: + def _get_next_event(self) -> t.Optional[EventBase]: """Pop the next event to be sent from the queue. :returns: The next event to send if any events are enqueued, otherwise `None`. @@ -512,7 +511,7 @@ def _broadcast(self, timeout: float = 0.001) -> int: num_listeners = len(self._descriptors) # send each event to every consumer - while event := self._get_next_event_event(): + while event := self._get_next_event(): logger.debug(f"Broadcasting {event=} to {num_listeners} listeners") event_bytes = bytes(event) @@ -524,7 +523,7 @@ def _broadcast(self, timeout: float = 0.001) -> int: num_sent += 1 except Exception as ex: raise SmartSimError( - f"Broadcast {i}/{num_listeners} for event {event.uid} to " + f"Broadcast {i+1}/{num_listeners} for event {event.uid} to " f"channel {descriptor} from {self._uid} failed." 
) from ex @@ -547,6 +546,7 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: except (KeyError, ValueError, SmartSimError): raise except Exception as ex: + logger.exception("An unexpected exception occurred while sending") raise SmartSimError("An unexpected failure occurred while sending") from ex @@ -600,8 +600,8 @@ def name(self) -> str: self._name = str(uuid.uuid4()) return self._name - def receive( - self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0 + def recv( + self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0.001 ) -> t.List[EventBase]: """Receives available published event(s). @@ -648,44 +648,35 @@ def receive( def register(self) -> None: """Send an event to register this consumer as a listener""" - awaiting_confirmation = True descriptor = self._comm_channel.descriptor - backoffs = itertools.cycle((0.1, 0.2, 0.4, 0.8)) event = OnCreateConsumer(descriptor, self._global_filters) - # create a temporary publisher to broadcast my own existence. - publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local) - - # we're going to sit in this loop to wait for the backbone to get - # updated with the registration (to avoid SEND/ACK) - while awaiting_confirmation: - registered_channels = self._backbone.notification_channels - # todo: this should probably be descriptor_string? maybe i need to - # get rid of descriptor as bytes or just make desc_string required in ABC - if descriptor in registered_channels: - awaiting_confirmation = False + registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER + config = self._backbone.wait_for([registrar_key], 2.0) - time.sleep(next(backoffs)) + registrar_descriptor = str(config.get(registrar_key, None)) - # if backend_descriptor := self._backbone.backend_channel: - # backend_channel = DragonCommChannel. - # from_descriptor(backend_descriptor) - # backend = EventSender(self._backbone, backend_channel) - # backend.send(event) + if registrar_descriptor: + logger.debug(f"Sending registration for {self.name}") - # broadcast that this consumer is now ready to mingle - publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local) - publisher.send(event, timeout=0.01) + registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) + registrar_channel.send(bytes(event), timeout=1.0) - # def register_callback(self, callback: t.Callable[[EventBase], None]) -> None: ... + logger.debug(f"Registration for {self.name} sent") + else: + logger.warning("Unable to register. 
No registrar channel found.") - def listen(self) -> None: + def listen_once(self, timeout: float = 0.001) -> None: """Function to handle incoming events""" - print("starting listener...") - - while True: - print("awaiting new message") - incoming_messages = self.receive() - for message in incoming_messages: - if self._event_handler: - self._event_handler(message) + logger.debug(f"Starting event listener with {timeout} second timeout") + logger.debug("Awaiting new messages") + + incoming_messages = self.recv(timeout=timeout) + + if not incoming_messages: + logger.debug("Consumer received empty message list.") + + for message in incoming_messages: + logger.debug(f"Sending event {message=} to handler.") + if self._event_handler: + self._event_handler(message) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index a4e61d430..0631e11e6 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -24,151 +24,197 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os +import typing as t import unittest.mock as mock import pytest -# from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import EventSender, OnCreateConsumer - -# dragon = pytest.importorskip("dragon") - -# import dragon.utils as du -# from dragon.channels import Channel -# from dragon.data.ddict.ddict import DDict -# from dragon.fli import DragonFLIError, FLInterface - -# from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -# from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( -# DragonFeatureStore, -# ) +from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBase, + EventBroadcaster, + EventConsumer, + EventSender, + OnCreateConsumer, +) +from smartsim.log import get_logger + +dragon = pytest.importorskip("dragon") + +import dragon.utils as du +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.fli import DragonFLIError, FLInterface + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon +logger = get_logger(__name__) def test_dragonbackend_listener_boostrapping(monkeypatch: pytest.MonkeyPatch): - """Verify that an event listener is started""" - # backend_channel = DragonCommChannel.from_local() - assert True - - # with monkeypatch.context() as patcher: - # # patcher.setattr("smartsim._core.launcher.dragon.dragonBackend", "NodePrioritizer", mock.MagicMock()) - # patcher.setattr(NodePrioritizer, "__init__", lambda self, nodes, lock: None) - # patcher.setattr(DragonBackend, "_initialize_hosts", lambda self: None) - - # backend = DragonBackend(pid=9999) - # backend._create_backbone() - - # # 
create the consumer and start a listener process - # backend_consumer = backend._create_eventing(backend._backbone) - - # # ensure the consumer that was created is retained - # assert backend._event_consumer is not None - # assert backend._event_consumer == backend_consumer - - # assert backend._backbone.notification_channels == [backend_consumer.descriptor] - - # # create components to publish events - # # sender_channel = DragonCommChannel.from_local() - # sender = EventSender(backend._backbone, backend_channel) - - # # simulate a new consumer registration - # new_consumer_channel = DragonCommChannel.from_local() - # registration = OnCreateConsumer(new_consumer_channel.descriptor) - # new_consumer_channel.send(bytes(registration), 0.1) - - # events = backend_consumer.receive() - # assert len(events) == 1 - - -# @pytest.mark.parametrize( -# "content", -# [ -# pytest.param(b"a"), -# pytest.param(b"new byte string"), -# ], -# ) -# def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): -# """A descriptor can be stored, loaded, and reattached""" -# chan = Channel.make_process_local() -# queue = FLInterface(main_ch=chan) -# monkeypatch.setenv( -# "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) -# ) - -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=DragonCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) -# config_queue = config.get_queue() - -# _ = config_queue.send(content) - -# old_recv = queue.recvh() -# result, _ = old_recv.recv_bytes() -# assert result == content - - -# def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): -# """The serialized descriptors of a loaded and unloaded -# queue are the same""" -# chan = Channel.make_process_local() -# queue = FLInterface(main_ch=chan) -# monkeypatch.setenv( -# "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) -# ) - -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=DragonCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_descriptor, -# ) -# config_queue = config.get_queue() -# assert config_queue._fli.serialize() == queue.serialize() - - -# def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): -# """An incorrect serialized descriptor will fails to attach""" -# monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=None, -# queue_factory=DragonFLIChannel.from_descriptor, -# ) - -# with pytest.raises(DragonFLIError): -# config.get_queue() - - -# def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): -# """Verify the dragon feature store is loaded correctly by the -# EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" -# feature_store = DragonFeatureStore(DDict()) -# monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) - -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=None, -# queue_factory=None, -# ) - -# print(f"calling config.get_backbone: `{feature_store.descriptor}`") - -# backbone = config.get_backbone() -# assert backbone is not None - - -# def test_environment_variables_not_set(): -# """EnvironmentConfigLoader getters return None when environment -# variables are not set""" -# 
config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=DragonCommChannel.from_descriptor, -# queue_factory=DragonCommChannel.from_descriptor, -# ) -# assert config.get_backbone() is None -# assert config.get_queue() is None + """Verify that the dragon backend registration channel correctly + registers new consumers in the backbone and begins sending events + to the new consumers""" + + backend = DragonBackend(pid=9999) + + backend._create_backbone() + backbone = backend._backbone + + def mock_event_handler(event: EventBase) -> None: + logger.debug(f"Handling event in mock handler: {event}") + + bb_descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + assert bb_descriptor + + fs = BackboneFeatureStore.from_descriptor(bb_descriptor) + fs[event.uid] = "received" + + # create the consumer and start a listener process + backend_consumer = backend._create_eventing(backbone) + registrar_descriptor = backend._event_consumer.descriptor + + # ensure the consumer is stored to backend & published to backbone + assert backend._event_consumer == backend_consumer + assert backbone.backend_channel == registrar_descriptor + assert os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + + # simulate a new consumer registration + new_consumer_ch = DragonCommChannel.from_local() + new_consumer = EventConsumer( + new_consumer_ch, + backbone, + [], + name="test-consumer-a", + event_handler=mock_event_handler, + ) + assert new_consumer, "new_consumer construction failed" + + # send registration to registrar channel + new_consumer.register() + + # the backend consumer should handle updating the notify list and the new + # consumer that just broadcast its registration should be registered... + # backend_consumer.listen_once(timeout=2.0) + backend.listen_to_registrations(timeout=0.1) + + # # confirm the backend registrar consumer registerd the new listener + assert new_consumer_ch.descriptor in backbone.notification_channels + + broadcaster = EventBroadcaster(backbone, DragonCommChannel.from_descriptor) + + # re-send the same thing because i'm too lazy to create a new consumer + broadcast_event = OnCreateConsumer(registrar_descriptor, []) + broadcaster.send(broadcast_event, timeout=0.1) + + new_consumer.listen_once(timeout=0.1) + + values = backbone.wait_for( + [broadcast_event.uid, BackboneFeatureStore.MLI_NOTIFY_CONSUMERS], 1.0 + ) + stored = values[broadcast_event.uid] + assert stored == "received", "The handler didn't update the backbone" + + # confirm that directly retrieving the value isn't different from + # using backbone.notification_channels helper method + notify_list = str(values[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS]).split(",") + assert new_consumer.descriptor in set(notify_list) + + +@pytest.mark.parametrize( + "content", + [ + pytest.param(b"a"), + pytest.param(b"new byte string"), + ], +) +def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): + """A descriptor can be stored, loaded, and reattached""" + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + ) + config_queue = config.get_queue() + + _ = config_queue.send(content) + + old_recv = queue.recvh() + 
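+    # read the payload back through the original FLI handle to confirm the
+    # loader attached to the same underlying queue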
result, _ = old_recv.recv_bytes()
+    assert result == content
+
+
+def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
+    """The serialized descriptors of a loaded and unloaded
+    queue are the same"""
+    chan = Channel.make_process_local()
+    queue = FLInterface(main_ch=chan)
+    monkeypatch.setenv(
+        "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())
+    )
+
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_descriptor,
+    )
+    config_queue = config.get_queue()
+    assert config_queue._fli.serialize() == queue.serialize()
+
+
+def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
+    """An incorrect serialized descriptor fails to attach"""
+    monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring")
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=None,
+        queue_factory=DragonFLIChannel.from_descriptor,
+    )
+
+    with pytest.raises(DragonFLIError):
+        config.get_queue()
+
+
+def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch):
+    """Verify the dragon feature store is loaded correctly by the
+    EnvironmentConfigLoader to demonstrate featurestore_factory correctness"""
+    feature_store = DragonFeatureStore(DDict())
+    monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor)
+
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=None,
+        queue_factory=None,
+    )
+
+    logger.debug(f"calling config.get_backbone: `{feature_store.descriptor}`")
+
+    backbone = config.get_backbone()
+    assert backbone is not None
+
+
+def test_environment_variables_not_set():
+    """EnvironmentConfigLoader getters return None when environment
+    variables are not set"""
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonCommChannel.from_descriptor,
+    )
+    assert config.get_backbone() is None
+    assert config.get_queue() is None
diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py
index 7f1649741..434bc5eab 100644
--- a/tests/dragon/test_featurestore.py
+++ b/tests/dragon/test_featurestore.py
@@ -181,15 +181,15 @@ def test_eventconsumer_eventpublisher_integration(
     mock_client_app.send(event_4)

     # worker manager should only get updates about feature update
-    wmgr_messages = wmgr_consumer.receive()
+    wmgr_messages = wmgr_consumer.recv()
     assert len(wmgr_messages) == 3

     # the backend should only receive messages about consumer creation
-    back_messages = back_consumer.receive()
+    back_messages = back_consumer.recv()
     assert len(back_messages) == 1

     # hypothetical app has no filters and will get all events
-    app_messages = capp_consumer.receive()
+    app_messages = capp_consumer.recv()
     assert len(app_messages) == 4

diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 87536c5ba..59a30a3e8 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -561,7 +561,7 @@ def test_eventconsumer_receive(test_dir: str) -> None:

     consumer = EventConsumer(comm_channel, backbone)

-    all_received: t.List[OnCreateConsumer] = consumer.receive()
+    all_received: t.List[OnCreateConsumer] = consumer.recv()
     assert len(all_received) == 1

     # verify we received the same event that was raised
@@
-595,7 +595,7 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: consumer = EventConsumer(comm_channel, backbone) - all_received: t.List[OnCreateConsumer] = consumer.receive() + all_received: t.List[OnCreateConsumer] = consumer.recv() assert len(all_received) == num_sent @@ -621,7 +621,7 @@ def test_eventconsumer_receive_empty(test_dir: str) -> None: consumer = EventConsumer(comm_channel, backbone) - messages = consumer.receive() + messages = consumer.recv() # the messages array should be empty assert not messages @@ -696,15 +696,15 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: mock_client_app.send(event_4) # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.receive() + wmgr_messages = wmgr_consumer.recv() assert len(wmgr_messages) == 3 # the backend should only receive messages about consumer creation - back_messages = back_consumer.receive() + back_messages = back_consumer.recv() assert len(back_messages) == 1 # hypothetical app has no filters and will get all events - app_messages = capp_consumer.receive() + app_messages = capp_consumer.recv() assert len(app_messages) == 4 diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index b088df5b4..ccc63def7 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -138,15 +138,15 @@ def test_eventconsumer_eventpublisher_integration( mock_client_app.send(event, timeout=0.1) # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.receive() + wmgr_messages = wmgr_consumer.recv() assert len(wmgr_messages) == 3 # the backend should only receive messages about consumer creation - back_messages = back_consumer.receive() + back_messages = back_consumer.recv() assert len(back_messages) == 1 # hypothetical app has no filters and will get all events - app_messages = capp_consumer.receive() + app_messages = capp_consumer.recv() assert len(app_messages) == 4 @@ -204,7 +204,7 @@ def test_eventconsumer_max_dequeue( num_dequeued = 0 - while wmgr_messages := wmgr_consumer.receive(timeout=0.01): + while wmgr_messages := wmgr_consumer.recv(timeout=0.01): # worker manager should not get more than `max_num_msgs` events num_dequeued += len(wmgr_messages) From d81334d37c865f96900bd7287c182b6db58aa2bb Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 25 Sep 2024 17:45:35 -0500 Subject: [PATCH 09/40] docstrings & miscellaneous minor fixes (reuse descriptor code, add dragon utils module, fix missing test env vars, ... 
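The descriptor helpers consolidated into the new `dragon_util` module all
share one base64 round-trip. A minimal sketch of the intended usage (names
taken from the module added below; illustrative only, and it must run
inside a dragon-backed process):

    import smartsim._core.mli.comm.channel.dragon_util as drg_util

    channel = drg_util.create_local(500)                  # local channel with explicit capacity
    descriptor = drg_util.channel_to_descriptor(channel)  # base64 string, env-var safe
    attached = drg_util.descriptor_to_channel(descriptor) # reattach anywhere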
--- conftest.py | 28 +- ex/high_throughput_inference/mock_app.py | 22 +- .../_core/launcher/dragon/dragonBackend.py | 85 ++- smartsim/_core/mli/comm/channel/channel.py | 7 +- .../_core/mli/comm/channel/dragon_channel.py | 120 +--- smartsim/_core/mli/comm/channel/dragon_fli.py | 27 +- .../_core/mli/comm/channel/dragon_util.py | 137 ++++ .../storage/backbone_feature_store.py | 41 +- .../storage/dragon_feature_store.py | 3 +- smartsim/log.py | 6 +- smartsim/protoclient.py | 105 +-- tests/dragon/channel.py | 19 +- tests/dragon/test_dragon_backend.py | 90 --- tests/dragon/test_environment_loader.py | 22 +- tests/dragon/test_featurestore.py | 6 +- tests/dragon/test_featurestore_integration.py | 32 +- tests/dragon/test_protoclient.py | 62 +- tests/dragon/test_request_dispatcher.py | 8 +- tests/dragon/test_worker_manager.py | 652 +++++++++--------- tests/dragon/utils/channel.py | 17 +- tests/mli/channel.py | 19 +- 21 files changed, 743 insertions(+), 765 deletions(-) create mode 100644 smartsim/_core/mli/comm/channel/dragon_util.py diff --git a/conftest.py b/conftest.py index 622dd7a7c..098a4a0c5 100644 --- a/conftest.py +++ b/conftest.py @@ -459,10 +459,15 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture(scope="function", autouse=True) def check_output_dir() -> None: - global test_output_dirs - assert os.path.isdir(test_output_root) - assert len(os.listdir(test_output_root)) >= test_output_dirs - test_output_dirs = len(os.listdir(test_output_root)) + try: + global test_output_dirs + assert os.path.isdir(test_output_root) + assert len(os.listdir(test_output_root)) >= test_output_dirs + test_output_dirs = len(os.listdir(test_output_root)) + except Exception: + # swallow error when the tests can't clean up test dirs + # and let the next run do the job. + ... @pytest.fixture @@ -1056,8 +1061,8 @@ def as_command(self) -> t.List[str]: NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]` portion of the necessary parameters to Popen. - :returns: A list of strings containing the arguments of the request - formatted for inclusion in a call to subprocess.Popen""" + :returns: The arguments of the request formatted appropriately to + Popen the `/tests/dragon/utils/msg_pump.py`""" return [ "--dispatch-fli-descriptor", self.work_queue_descriptor, @@ -1075,11 +1080,16 @@ def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]: """A pytest fixture used to create a mock event producer capable of feeding asynchronous inference requests to tests requiring them. - :returns: A function that can be passed appropriate descriptors - for starting a message pump.""" + :returns: A function that opens a subprocess running a mock message pump + """ def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: - """Invokes the message pump entry-point""" + """Invoke the message pump entry-point with the descriptors + from the request. 
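+
+        The subprocess executes `sys.executable` against
+        `tests/dragon/utils/msg_pump.py`, appending the arguments produced
+        by the request's `as_command` output.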
+ + :param request: A request containing all parameters required to + invoke the message pump entrypoint + :returns: The Popen object for the subprocess that was started""" # /tests/dragon/utils/msg_pump.py msg_pump_script = "tests/dragon/utils/msg_pump.py" msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 31195c7e6..2886bd5f9 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -60,7 +60,12 @@ class ResNetWrapper: + """Wrapper around a pre-rained ResNet model.""" def __init__(self, name: str, model: str): + """Initialize the instance. + + :param name: The name to use for the model + :param model: The path to the pre-trained PyTorch model""" self._model = torch.jit.load(model) self._name = name buffer = io.BytesIO() @@ -69,14 +74,25 @@ def __init__(self, name: str, model: str): self._serialized_model = buffer.getvalue() def get_batch(self, batch_size: int = 32): + """Create a random batch of data with the correct dimensions to + invoke a ResNet model. + + :param batch_size: The desired number of samples to produce + :returns: A PyTorch tensor""" return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property - def model(self): + def model(self) -> bytes: + """The content of a model file. + + :returns: The model bytes""" return self._serialized_model @property - def name(self): + def name(self) -> str: + """The name applied to the model. + + :returns: The name""" return self._name @@ -90,7 +106,7 @@ def name(self): resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = ProtoClient(timing_on=True, wait_timeout=0) - # client.set_model(resnet.name, resnet.model) + client.set_model(resnet.name, resnet.model) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: # TODO: adapt to non-Nvidia devices diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 3fe120a9d..6dc61516e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,7 +26,6 @@ import collections import functools import itertools -import multiprocessing as mp import os import time import typing as t @@ -36,27 +35,23 @@ from tabulate import tabulate -# pylint: disable=import-error,C0302,R0915,R6301 +# pylint: disable=import-error,C0302,R0915 # isort: off import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -# import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter -from smartsim._core.mli.comm.channel.dragon_channel import ( - DragonCommChannel, - create_local, -) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBase, - # EventBroadcaster, EventCategory, EventConsumer, OnCreateConsumer, @@ -86,9 +81,11 @@ logger = get_logger(__name__) +# TODO: create ticket for follow-up task to replace defunct +# dragon_group_state.Running() & 
.Error() class DragonStatus(str, Enum): - ERROR = "Error" # str(dragon_group_state.Error()) - RUNNING = "Running" # str(dragon_group_state.Running()) + ERROR = "Error" + RUNNING = "Running" def __str__(self) -> str: return self.value @@ -195,20 +192,13 @@ def __init__(self, pid: int) -> None: """Whether the server frontend should shut down when the backend does""" self._shutdown_initiation_time: t.Optional[float] = None """The time at which the server initiated shutdown""" - smartsim_config = get_config() - self._cooldown_period = ( - smartsim_config.telemetry_frequency * 2 + 5 - if smartsim_config.telemetry_enabled - else 5 - ) - """Time in seconds needed to server to complete shutdown""" + self._cooldown_period = self._initialize_cooldown() + """Time in seconds needed by the server to complete shutdown""" self._backbone: t.Optional[BackboneFeatureStore] = None """The backbone feature store""" self._event_consumer: t.Optional[EventConsumer] = None - """A listener registered to listen for new consumers and update the shared + """A consumer registered to listen for new consumers and update the shared consumer registrations list""" - self._event_consumer_process: t.Optional[mp.Process] = None - """The process executing the event consumers `listen` method""" """An event consumer for receiving events from MLI resources""" self._nodes: t.List["dragon_machine.Node"] = [] @@ -223,8 +213,6 @@ def __init__(self, pid: int) -> None: """Mapping with hostnames as keys and a set of running step IDs as the value""" self._initialize_hosts() - self._view = DragonBackendView(self) - logger.debug(self._view.host_desc) self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) @property @@ -276,10 +264,8 @@ def status_message(self) -> str: :returns: a status message """ - return ( - "Dragon server backend update\n" - f"{self._view.host_table}\n{self._view.step_table}" - ) + view = DragonBackendView(self) + return "Dragon server backend update\n" f"{view.host_table}\n{view.step_table}" def _heartbeat(self) -> None: """Update the value of the last heartbeat to the current time.""" @@ -580,12 +566,15 @@ def _create_backbone(self) -> BackboneFeatureStore: # put the backbone descriptor in the env vars os.environ.update(self._backbone.get_env()) - logger.info(self._backbone.creation_date) return self._backbone def _on_consumer_created(self, event: EventBase) -> None: - """Event handler for""" + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ if isinstance(event, OnCreateConsumer) and self._backbone is not None: notify_list = set(self._backbone.notification_channels) notify_list.add(event.descriptor) @@ -594,29 +583,29 @@ def _on_consumer_created(self, event: EventBase) -> None: logger.warning(f"Unhandled event received: {event}") - def _bootstrap_event_listeners( - self, backbone: BackboneFeatureStore, consumer: EventConsumer - ) -> None: - """Update the list of notification channels registered in the backbone. - - :param backbone: The backbone feature store to update""" - # Copy the consumer list so a backend restart doesn't clear registrations - notify_descriptors = list(backbone.notification_channels) - - # Update directly to avoid SEND/ACK pattern - notify_descriptors.append(consumer.descriptor) - notify_descriptors = list(set(notify_descriptors)) + @staticmethod + def _initialize_cooldown() -> int: + """Load environment configuration and determine the correct cooldown + period to apply to the backend process. 
- backbone.notification_channels = notify_descriptors + :returns: The calculated cooldown (in seconds) + """ + smartsim_config = get_config() + return ( + smartsim_config.telemetry_frequency * 2 + 5 + if smartsim_config.telemetry_enabled + else 5 + ) def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: """ Create an event publisher and event consumer for communicating with other MLI resources. - :param backbone: The backbone feature store used by the MLI backend. NOTE: - passing backbone as a parameter to ensure the backbone is initialized before - attempting to connect any eventing clients. + :param backbone: The backbone feature store used by the MLI backend. + + NOTE: the backbone must be initialized before connecting to eventing clients. + :returns: The newly created EventConsumer instance """ @@ -639,10 +628,14 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: return self._event_consumer def listen_to_registrations(self, timeout: float = 0.001) -> None: + """Execute the listener for registration events. + + :param timeout: Maximum time to wait (in seconds) for a new event""" if self._event_consumer is not None: self._event_consumer.listen_once(timeout) - def _start_eventing_listeners(self) -> None: + @staticmethod + def _start_eventing_listeners() -> None: # todo: start external listener entrypoint ... @@ -969,6 +962,8 @@ def __init__(self, backend: DragonBackend) -> None: self._backend = backend """A dragon backend used to produce the view""" + logger.debug(self.host_desc) + @property def host_desc(self) -> str: hosts = self._backend.hosts diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index a581e8e2a..104333ce7 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -76,12 +76,7 @@ def descriptor(self) -> str: """ return self._descriptor - @property - def decoded_descriptor(self) -> bytes: - """Return the descriptor decoded from a string into bytes""" - return base64.b64decode(self._descriptor.encode("utf-8")) - def __str__(self) -> str: - """Build a string representation of the channel useful for printing""" + """Build a string representation of the channel useful for printing.""" classname = type(self).__class__.__name__ return f"{classname}('{self._name}', '{self._descriptor}')" diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 9c0ac3423..7534719e7 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -24,17 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import base64 -import sys import typing as t import dragon.channels as dch -import dragon.infrastructure.facts as df -import dragon.infrastructure.parameters as dp -import dragon.managed_memory as dm -import dragon.utils as du import smartsim._core.mli.comm.channel.channel as cch +import smartsim._core.mli.comm.channel.dragon_util as drg_util from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -49,75 +44,6 @@ unnecessary retries when creating a local channel.""" -def _channel_to_descriptor(channel: dch.Channel) -> str: - """Utility method for converting a channel to a descriptor string. 
- - :param channel: The dragon channel to convert - :returns: The descriptor string - """ - if channel is None: - raise SmartSimError("Channel is not available to create a descriptor") - - serialized_ch = channel.serialize() - return base64.b64encode(serialized_ch).decode("utf-8") - - -def _pool_to_descriptor(pool: dm.MemoryPool) -> str: - """Utility method for converting a pool to a descriptor string. - - :param pool: The memory pool to convert - :returns: The descriptor string""" - if pool is None: - raise SmartSimError("Memory pool is not available to create a descriptor") - - serialized_pool = pool.serialize() - return base64.b64encode(serialized_pool).decode("utf-8") - - -def create_local(capacity: int = 0) -> dch.Channel: - """Creates a Channel attached to the local memory pool. Replacement for - direct calls to `dch.Channel.make_process_local()` to enable - supplying a channel capacity. - - :param capacity: The number of events the channel can buffer; uses the default - buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied - :returns: The instantiated channel - :raises SmartSimError: If unable to attach local channel - """ - pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) - pool_descriptor = _pool_to_descriptor(pool) - channel: t.Optional[dch.Channel] = None - offset = 0 - - global LAST_OFFSET - if LAST_OFFSET: - offset = LAST_OFFSET - - capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE - - while not channel: - # search for an open channel ID - offset += 1 - cid = df.BASE_USER_MANAGED_CUID + offset - try: - channel = dch.Channel(mem_pool=pool, c_uid=cid, capacity=capacity) - LAST_OFFSET = offset - descriptor = _channel_to_descriptor(channel) - logger.debug( - "Local channel creatd: " - f"{cid=}, {pool_descriptor=}, {capacity=}, {descriptor=}" - ) - except dch.ChannelError as e: - if offset < 100: - logger.warning(f"Channnel id {cid} is not open. Retrying...") - else: - LAST_OFFSET = 0 - logger.error(f"All attempts to attach local channel have failed") - raise SmartSimError("Failed to attach local channel") from e - - return channel - - class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel.""" @@ -126,7 +52,7 @@ def __init__(self, channel: "dch.Channel") -> None: :param channel: A channel to use for communications """ - descriptor = _channel_to_descriptor(channel) + descriptor = drg_util.channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel @@ -175,23 +101,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: return messages - @property - def descriptor_string(self) -> str: - """Return the channel descriptor for the underlying dragon channel - as a string. Automatically performs base64 encoding to ensure the - string can be used in a call to `from_descriptor`. - - :returns: String representation of channel descriptor - :raises ValueError: If unable to convert descriptor to a string - """ - if isinstance(self._descriptor, str): - return self._descriptor - - if isinstance(self._descriptor, bytes): - return base64.b64encode(self._descriptor).decode("utf-8") - - raise ValueError(f"Unable to convert channel descriptor: {self._descriptor}") - @classmethod def from_descriptor( cls, @@ -199,36 +108,29 @@ def from_descriptor( ) -> "DragonCommChannel": """A factory method that creates an instance from a descriptor string. - :param descriptor: The descriptor that uniquely identifies the resource. Output - from `descriptor_string` is correctly encoded. 
+ :param descriptor: The descriptor that uniquely identifies the resource. :returns: An attached DragonCommChannel - :raises SmartSimError: If creation of comm channel fails""" + :raises SmartSimError: If creation of comm channel fails + """ try: if isinstance(descriptor, bytes): raise ValueError("Descriptor must be a string") - utf8_descriptor: t.Union[str, bytes] = descriptor - if isinstance(descriptor, str): - utf8_descriptor = descriptor.encode("utf-8") - - # todo: ensure the bytes argument and condition are removed - # after refactoring the RPC models - - actual_descriptor = base64.b64decode(utf8_descriptor) - channel = dch.Channel.attach(actual_descriptor) + channel = drg_util.descriptor_to_channel(descriptor) return DragonCommChannel(channel) - except Exception as e: + except Exception as ex: raise SmartSimError( f"Failed to create dragon comm channel: {descriptor}" - ) from e + ) from ex @classmethod def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel": - """A factory method that creates a local channel instance + """A factory method that creates a local channel instance. + :param _descriptor: Unused placeholder :returns: An attached DragonCommChannel""" try: - channel = dch.Channel.make_process_local() + channel = drg_util.create_local() return DragonCommChannel(channel) except: logger.error(f"Failed to create local dragon comm channel", exc_info=True) diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 22593f63c..13eb58a2e 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -27,18 +27,13 @@ # isort: off from dragon import fli import dragon.channels as dch -import dragon.infrastructure.facts as df -import dragon.infrastructure.parameters as dp -import dragon.managed_memory as dm -import dragon.utils as du # isort: on -import base64 import typing as t import smartsim._core.mli.comm.channel.channel as cch -from smartsim._core.mli.comm.channel.dragon_channel import create_local +import smartsim._core.mli.comm.channel.dragon_util as drg_util from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -60,22 +55,19 @@ def __init__( :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams :param buffer_size: Maximum number of sent messages that can be buffered """ - descriptor = base64.b64encode(fli_.serialize()).decode("utf-8") + descriptor = drg_util.channel_to_descriptor(fli_) super().__init__(descriptor) self._fli = fli_ self._channel: t.Optional["dch.Channel"] = ( - create_local(buffer_size) if sender_supplied else None + drg_util.create_local(buffer_size) if sender_supplied else None ) - def send( - self, value: bytes, timeout: float = 0.001, blocking: bool = False - ) -> None: + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. 
:param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send - :param blocking: Block returning until the message has been received :raises SmartSimError: If sending message fails """ try: @@ -110,13 +102,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: ) from e return messages - @classmethod - def _string_descriptor_to_fli(cls, descriptor: str) -> "fli.FLInterface": - """Helper method to convert a string-safe, encoded descriptor back - into its original byte format""" - descriptor_ = base64.b64decode(descriptor.encode("utf-8")) - return fli.FLInterface.attach(descriptor_) - @classmethod def from_sender_supplied_descriptor( cls, @@ -128,7 +113,7 @@ def from_sender_supplied_descriptor( :returns: An attached DragonFLIChannel""" try: return DragonFLIChannel( - fli_=cls._string_descriptor_to_fli(descriptor), + fli_=drg_util.descriptor_to_fli(descriptor), sender_supplied=True, ) except: @@ -153,7 +138,7 @@ def from_descriptor( try: return DragonFLIChannel( - fli_=cls._string_descriptor_to_fli(descriptor), + fli_=drg_util.descriptor_to_fli(descriptor), sender_supplied=False, ) except Exception as e: diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py new file mode 100644 index 000000000..2980dc9a6 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragon_util.py @@ -0,0 +1,137 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import typing as t + +import dragon.channels as dch +import dragon.fli as fli +import dragon.infrastructure.facts as df +import dragon.infrastructure.parameters as dp +import dragon.managed_memory as dm +import dragon.utils as du + +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + +DEFAULT_CHANNEL_BUFFER_SIZE = 500 +"""Maximum number of messages that can be buffered. DragonCommChannel will +raise an exception if no clients consume messages before the buffer is filled.""" + +LAST_OFFSET = 0 +"""The last offset used to create a local channel. 
This is used to avoid +unnecessary retries when creating a local channel.""" + + +def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str: + """Utility method for converting a channel to a descriptor string. + + :param channel: The dragon channel to convert + :returns: The descriptor string + """ + if channel is None: + raise SmartSimError("Channel is not available to create a descriptor") + + serialized_ch = channel.serialize() + return base64.b64encode(serialized_ch).decode("utf-8") + + +def pool_to_descriptor(pool: dm.MemoryPool) -> str: + """Utility method for converting a pool to a descriptor string. + + :param pool: The memory pool to convert + :returns: The descriptor string""" + if pool is None: + raise SmartSimError("Memory pool is not available to create a descriptor") + + serialized_pool = pool.serialize() + return base64.b64encode(serialized_pool).decode("utf-8") + + +def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": + """Helper method to attach a new FLI instance given + the string-encoded descriptor. + + :param descriptor: The descriptor of an FLI to attach to + :returns: The attached dragon FLI""" + descriptor_ = base64.b64decode(descriptor.encode("utf-8")) + return fli.FLInterface.attach(descriptor_) + + +def descriptor_to_channel(descriptor: str) -> dch.Channel: + """Helper method to attach a new Channel instance given + the string-encoded descriptor. + + :param descriptor: The descriptor of a channel to attach to + :returns: The attached dragon Channel""" + descriptor_ = base64.b64decode(descriptor.encode("utf-8")) + return dch.Channel.attach(descriptor_) + + +def create_local(capacity: int = 0) -> dch.Channel: + """Creates a Channel attached to the local memory pool. Replacement for + direct calls to `dch.Channel.make_process_local()` to enable + supplying a channel capacity. + + :param capacity: The number of events the channel can buffer; uses the default + buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied + :returns: The instantiated channel + :raises SmartSimError: If unable to attach local channel + """ + pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) + pool_descriptor = pool_to_descriptor(pool) + channel: t.Optional[dch.Channel] = None + offset = 0 + + global LAST_OFFSET + if LAST_OFFSET: + offset = LAST_OFFSET + + capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE + + while not channel: + # search for an open channel ID + offset += 1 + channel_id = df.BASE_USER_MANAGED_CUID + offset + try: + channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity) + LAST_OFFSET = offset + descriptor = channel_to_descriptor(channel) + logger.debug( + "Local channel created: " + f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}" + ) + except dch.ChannelError as e: + if offset < 100: + logger.warning(f"Channnel id `{channel_id}` is not open. 
Retrying...") + else: + LAST_OFFSET = 0 + logger.error(f"All attempts to attach local channel have failed") + raise SmartSimError("Failed to attach local channel") from e + + return channel diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 83c255fe7..f8515220f 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -92,17 +92,27 @@ def __init__( @property def wait_timeout(self) -> float: + """Retrieve the wait timeout for this feature store. The wait timeout is + applied to all calls to `wait_for`. + + :returns: The wait timeout (in seconds). + """ return self._wait_timeout @wait_timeout.setter def wait_timeout(self, value: float) -> None: + """Set the wait timeout (in seconds) for this feature store. The wait + timeout is applied to all calls to `wait_for`. + + :param value: The new value to set + """ self._wait_timeout = value @property def notification_channels(self) -> t.Sequence[str]: """Retrieve descriptors for all registered MLI notification channels. - :returns: The list of descriptors + :returns: The list of channel descriptors """ if self.MLI_NOTIFY_CONSUMERS in self: stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] @@ -119,26 +129,26 @@ def notification_channels(self, values: t.Sequence[str]) -> None: @property def backend_channel(self) -> t.Optional[str]: - """Retrieve the channel descriptor exposed by the MLI backend for events + """Retrieve the channel descriptor exposed by the MLI backend for events. - :returns: a stringified channel descriptor""" + :returns: The channel descriptor""" if self.MLI_BACKEND_CONSUMER in self: return str(self[self.MLI_BACKEND_CONSUMER]) return None @backend_channel.setter def backend_channel(self, value: str) -> None: - """Set the channel exposed by the MLI backend for events + """Set the channel exposed by the MLI backend for events. - :param value: a stringified channel descriptor""" + :param value: The stringified channel descriptor""" self[self.MLI_BACKEND_CONSUMER] = value @property def worker_queue(self) -> t.Optional[str]: """Retrieve the channel descriptor exposed by the MLI - backend to send work to an MLI worker manager instance + backend to send work to an MLI worker manager instance. - :returns: a stringified channel descriptor""" + :returns: The channel descriptor, if found. Otherwise, `None`""" if self.MLI_WORKER_QUEUE in self: return str(self[self.MLI_WORKER_QUEUE]) return None @@ -146,18 +156,20 @@ def worker_queue(self) -> t.Optional[str]: @worker_queue.setter def worker_queue(self, value: str) -> None: """Set the channel descriptor exposed by the MLI - backend to send work to an MLI worker manager instance + backend to send work to an MLI worker manager instance. - :param value: a stringified channel descriptor""" + :param value: The channel descriptor""" self[self.MLI_WORKER_QUEUE] = value @property def creation_date(self) -> str: - """Return the creation date for the backbone feature store""" + """Return the creation date for the backbone feature store. 
+ + :returns: The string-formatted date when feature store was created""" return str(self[self._CREATED_ON]) def _record_creation_data(self) -> None: - """Write the creation timestamp to the feature store""" + """Write the creation timestamp to the feature store.""" if self._CREATED_ON not in self: if not self._allow_reserved_writes: logger.warning( @@ -180,9 +192,8 @@ def from_writable_descriptor( try: return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True) except Exception as ex: - logger.error(f"Error creating dragon feature store: {descriptor}") raise SmartSimError( - f"Error creating dragon feature store: {descriptor}" + f"Error creating backbone feature store: {descriptor}" ) from ex def _check_wait_timeout( @@ -568,8 +579,8 @@ def __init__( :param backbone: The MLI backbone feature store :param filters: A list of event types to deliver. when empty, all events will be delivered - :param timeout: Maximum time to wait for messages to arrive; may be overridden - on individual calls to `receive` + :param name: A user-friendly name for logging. If not provided, an + auto-generated GUID will be used :raises ValueError: If batch_timeout <= 0 """ if batch_timeout is not None and batch_timeout <= 0: diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index 0256b1a51..4eeeac32f 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -101,7 +101,6 @@ def from_descriptor( logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}") return cls(dragon_ddict.DDict.attach(descriptor)) except Exception as ex: - logger.error(f"Error creating dragon feature store: {descriptor}") raise SmartSimError( - f"Error creating dragon feature store: {descriptor}" + f"Error creating dragon feature store from descriptor: {descriptor}" ) from ex diff --git a/smartsim/log.py b/smartsim/log.py index a28112efa..c8fed9329 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -258,10 +258,12 @@ def log_to_file( """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. - :param filename: the name of the desired log file. - :param log_level: as defined in get_logger. Can be specified + :param filename: The name of the desired log file. + :param log_level: As defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. + :param logger: If supplied, a logger to add the file stream logging + behavior to. By default, a new logger is instantiated. 
""" if logger is None: logger = logging.getLogger("SmartSim") diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index 3e786cf05..c248300ca 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -27,7 +27,6 @@ # isort: off # pylint: disable=unused-import,import-error import dragon -from dragon import fli import dragon.channels from dragon.globalservices.api_setup import connect_to_infrastructure @@ -49,11 +48,9 @@ import numpy import torch -from smartsim._core.mli.comm.channel.dragon_channel import ( - DragonCommChannel, - create_local, -) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBroadcaster, @@ -86,16 +83,17 @@ class ProtoClient: triggering QueueFull exceptions.""" @staticmethod - def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: + def _attach_to_backbone() -> BackboneFeatureStore: """Use the supplied environment variables to attach to a pre-existing backbone featurestore. Requires the environment to contain `_SMARTSIM_INFRA_BACKBONE` - environment variable + environment variable. - :returns: the attached backbone featurestore""" + :returns: The attached backbone featurestore + """ # todo: ensure this env var from config loader or constant descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) - if descriptor is None: + if descriptor is None or not descriptor: raise SmartSimError( "Missing required backbone configuration in environment: " f"{BackboneFeatureStore.MLI_BACKBONE}" @@ -104,12 +102,16 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: backbone = t.cast( BackboneFeatureStore, BackboneFeatureStore.from_descriptor(descriptor) ) - backbone.wait_timeout = wait_timeout return backbone def _attach_to_worker_queue(self) -> DragonFLIChannel: """Wait until the backbone contains the worker queue configuration, - then attach an FLI to the given worker queue""" + then attach an FLI to the given worker queue. + + :returns: The attached FLI channel + :raises: SmartSimError if the required configuration is not found in the + backbone feature store + """ descriptor = "" try: @@ -120,50 +122,36 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE]) except Exception as ex: logger.info( - f"Unable to rerieve {BackboneFeatureStore.MLI_WORKER_QUEUE} " + f"Unable to retrieve {BackboneFeatureStore.MLI_WORKER_QUEUE} " "to attach to the worker queue." ) - raise ValueError("Unable to locate worker queue using backbone") from ex + raise SmartSimError("Unable to locate worker queue using backbone") from ex return DragonFLIChannel.from_descriptor(descriptor) - @classmethod - def _create_worker_channels( - cls, - ) -> t.Tuple[dragon.channels.Channel, dragon.channels.Channel]: - """Create channels to be used for communication to and from the worker queue. - - :returns: A tuple containing the native from and to - Channels as (from_channel, to_channel). - """ - - _from_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) - _to_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) - - return _from_worker_ch_raw, _to_worker_ch_raw - def _create_broadcaster(self) -> EventProducer: """Create an event publisher that will broadcast updates to other MLI components. 
 
-        :returns: the event publisher instance"""
+        :returns: the event publisher instance
+        """
         broadcaster = EventBroadcaster(
             self._backbone, DragonCommChannel.from_descriptor
         )
         return broadcaster
 
     def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
-        """Initialize the client instance
+        """Initialize the client instance.
 
         :param timing_on: Flag indicating if timing information should be
         written to file
         :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the
         worker queue
-
-        :raises: SmartSimError if unable to attach to a backbone featurestore"""
-        # todo: determine a way to make this work in tests.
-        # - consider catching the import exception and defaulting rank to 0
+        :raises SmartSimError: If unable to attach to a backbone featurestore
+        """
         if MPI is not None:
+            # todo: determine a way to make MPI work in the test environment
+            # - consider catching the import exception and defaulting rank to 0
            comm = MPI.COMM_WORLD
            rank: int = comm.Get_rank()
        else:
@@ -173,12 +161,12 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
 
         connect_to_infrastructure()
 
-        self._backbone = self._attach_to_backbone(wait_timeout=self.backbone_timeout)
+        self._backbone = self._attach_to_backbone()
+        self._backbone.wait_timeout = self.backbone_timeout
         self._to_worker_fli = self._attach_to_worker_queue()
 
-        channels = self._create_worker_channels()
-        self._from_worker_ch = channels[0]
-        self._to_worker_ch = channels[1]
+        self._from_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE)
+        self._to_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE)
 
         self._publisher = self._create_broadcaster()
 
@@ -199,14 +187,29 @@ def backbone_timeout(self) -> float:
         return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT
 
     def _add_label_to_timings(self, label: str) -> None:
+        """Adds a new label into the timing dictionary to prepare for
+        receiving timing events.
+
+        :param label: The label to create storage for
+        """
         if label not in self._timings:
             self._timings[label] = []
 
     @staticmethod
     def _format_number(number: t.Union[numbers.Number, float]) -> str:
+        """Utility function for formatting numbers consistently for logs.
+
+        :param number: The number to convert to a formatted string
+        :returns: The formatted string containing the number
+        """
         return f"{number:0.4e}"
 
     def start_timings(self, batch_size: numbers.Number) -> None:
+        """Configure the client to begin storing timing information.
+
+        :param batch_size: The size of batches to generate as inputs
+        to the model
+        """
         if self._timing_on:
             self._add_label_to_timings("batch_size")
             self._timings["batch_size"].append(self._format_number(batch_size))
@@ -214,6 +217,7 @@ def start_timings(self, batch_size: numbers.Number) -> None:
             self._interm = time.perf_counter()
 
     def end_timings(self) -> None:
+        """Configure the client to stop storing timing information."""
         if self._timing_on and self._start is not None:
             self._add_label_to_timings("total_time")
             self._timings["total_time"].append(
@@ -221,6 +225,10 @@ def end_timings(self) -> None:
             )
 
     def measure_time(self, label: str) -> None:
+        """Measures elapsed time since the last recorded signal.
+
+        :param label: The label to measure time for
+        """
         if self._timing_on and self._interm is not None:
             self._add_label_to_timings(label)
             self._timings[label].append(
@@ -229,6 +237,11 @@ def measure_time(self, label: str) -> None:
             self._interm = time.perf_counter()
 
     def print_timings(self, to_file: bool = False) -> None:
+        """Print timing information to standard output.
+
+        :param to_file: If `True`, also saves timing information
+        to the files `timings.npy` and `timings.txt`
+        """
         print(" ".join(self._timings.keys()))
         value_array = numpy.array(self._timings.values(), dtype=float)
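Taken together, these helpers bracket a unit of work: `start_timings` opens a timing window, `measure_time` records named intervals, `end_timings` closes the window, and `print_timings` reports what was collected. A minimal usage sketch (hypothetical driver code; assumes a dragon backbone and worker queue are already running so the client can construct):

    import torch
    from smartsim.protoclient import ProtoClient

    client = ProtoClient(timing_on=True)
    batch = torch.randn((32, 2), dtype=torch.float32)

    client.start_timings(batch.shape[0])  # open the timing window
    client.measure_time("preprocess")     # record interval since the last mark
    client.end_timings()                  # append total_time and close the window
    client.print_timings(to_file=True)    # also writes timings.npy / timings.txt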
@@ -240,6 +253,14 @@
         numpy.savetxt("timings.txt", value_array)
 
     def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any:
+        """Execute a batch of inference requests with the supplied ML model.
+
+        :param model: The raw bytes or path to a pytorch model
+        :param batch: The tensor batch to perform inference on
+        :returns: The inference results
+        :raises ValueError: If the worker queue is not configured properly
+        in the environment variables
+        """
         tensors = [batch.numpy()]
         self.perf_timer.start_timings("batch_size", batch.shape[0])
         built_tensor_desc = MessageHandler.build_tensor_descriptor(
@@ -301,9 +322,11 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any:
         return result
 
     def set_model(self, key: str, model: bytes) -> None:
-        # todo: incorrect usage of backbone here to store
-        # user models? are we using the backbone if they do NOT
-        # have a feature store of their own?
+        """Write the supplied model to the feature store.
+
+        :param key: The unique key used to identify the model
+        :param model: The raw bytes of the model to execute
+        """
         self._backbone[key] = model
 
         # notify components of a change in the data at this key
diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py
index efabb00c0..4c46359c2 100644
--- a/tests/dragon/channel.py
+++ b/tests/dragon/channel.py
@@ -40,9 +40,10 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
     def __init__(self, key: pathlib.Path) -> None:
-        """Initialize the FileSystemCommChannel instance
+        """Initialize the FileSystemCommChannel instance.
 
-        :param key: a path to the root directory of the feature store"""
+        :param key: a path to the root directory of the feature store
+        """
         self._lock = threading.RLock()
 
         super().__init__(key.as_posix())
@@ -57,7 +58,7 @@ def send(self, value: bytes, timeout: float = 0) -> None:
         """Send a message through the underlying communication channel.
 
         :param value: The value to send
-        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :param timeout: maximum time to wait (in seconds) for messages to send
         """
         with self._lock:
             # write as text so we can add newlines as delimiters
@@ -67,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None:
             logger.debug(f"FileSystemCommChannel {self._file_path} sent message")
 
     def recv(self, timeout: float = 0) -> t.List[bytes]:
-        """Receives message(s) through the underlying communication channel
+        """Receives message(s) through the underlying communication channel.
:param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -100,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,10 +112,11 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: path = pathlib.Path(descriptor) return FileSystemCommChannel(path) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index 0631e11e6..0e16be5e2 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -128,93 +128,3 @@ def mock_event_handler(event: EventBase) -> None: # using backbone.notification_channels helper method notify_list = str(values[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS]).split(",") assert new_consumer.descriptor in set(notify_list) - - -@pytest.mark.parametrize( - "content", - [ - pytest.param(b"a"), - pytest.param(b"new byte string"), - ], -) -def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): - """A descriptor can be stored, loaded, and reattached""" - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, - ) - config_queue = config.get_queue() - - _ = config_queue.send(content) - - old_recv = queue.recvh() - result, _ = old_recv.recv_bytes() - assert result == content - - -def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): - """The serialized descriptors of a loaded and unloaded - queue are the same""" - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - config_queue = config.get_queue() - assert config_queue._fli.serialize() == queue.serialize() - - -def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): - """An incorrect serialized descriptor will fails to attach""" - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=None, - queue_factory=DragonFLIChannel.from_descriptor, - ) - - with pytest.raises(DragonFLIError): - config.get_queue() - - -def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): - """Verify the dragon feature store 
is loaded correctly by the - EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" - feature_store = DragonFeatureStore(DDict()) - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=None, - queue_factory=None, - ) - - print(f"calling config.get_backbone: `{feature_store.descriptor}`") - - backbone = config.get_backbone() - assert backbone is not None - - -def test_environment_variables_not_set(): - """EnvironmentConfigLoader getters return None when environment - variables are not set""" - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - assert config.get_backbone() is None - assert config.get_queue() is None diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index b8c2af9c0..4f45614d9 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -94,7 +94,9 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): """An incorrect serialized descriptor will fails to attach""" + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") + config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=None, @@ -123,13 +125,17 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): assert backbone is not None -def test_environment_variables_not_set(): +def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch): """EnvironmentConfigLoader getters return None when environment variables are not set""" - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - assert config.get_backbone() is None - assert config.get_queue() is None + with monkeypatch.context() as patch: + patch.setenv("_SMARTSIM_INFRA_BACKBONE", "") + patch.setenv("_SMARTSIM_REQUEST_QUEUE", "") + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, + ) + assert config.get_backbone() is None + assert config.get_queue() is None diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 434bc5eab..f59501df1 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -129,9 +129,9 @@ def test_eventconsumer_eventpublisher_integration( capp_channel = DragonCommChannel(capp_channel_) back_channel = DragonCommChannel(back_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string - capp_consumer_descriptor = capp_channel.descriptor_string - back_consumer_descriptor = back_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index ccc63def7..fa6f99001 100644 --- 
a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -33,8 +33,8 @@ from smartsim._core.mli.comm.channel.dragon_channel import ( DEFAULT_CHANNEL_BUFFER_SIZE, DragonCommChannel, - create_local, ) +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBroadcaster, @@ -82,17 +82,13 @@ def test_eventconsumer_eventpublisher_integration( backbone["test_dir"] = test_dir assert backbone["test_dir"] == test_dir - wmgr_channel_ = Channel.make_process_local() - capp_channel_ = Channel.make_process_local() - back_channel_ = Channel.make_process_local() - - wmgr_channel = DragonCommChannel(wmgr_channel_) - capp_channel = DragonCommChannel(capp_channel_) - back_channel = DragonCommChannel(back_channel_) + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string - capp_consumer_descriptor = capp_channel.descriptor_string - back_consumer_descriptor = back_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( @@ -166,18 +162,20 @@ def test_eventconsumer_max_dequeue( storage_for_dragon_fs: t.Any, ) -> None: """Verify that a consumer does not sit and collect messages indefinitely - by checking that a consumer returns after a maximum timeout is exceeded + by checking that a consumer returns after a maximum timeout is exceeded. - :param num_events: the total number of events to raise in the test - :param batch_timeout: the maximum wait time for a message to be sent. - :param storage_for_dragon_fs: the dragon storage engine to use""" + :param num_events: Total number of events to raise in the test + :param batch_timeout: Maximum wait time (in seconds) for a message to be sent + :param max_batches_expected: Maximum number of receives that should occur + :param storage_for_dragon_fs: Dragon storage engine to use + """ mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) wmgr_channel_ = Channel.make_process_local() wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( @@ -242,7 +240,7 @@ def test_channel_buffer_size( wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor # create a broadcaster to publish messages. 
create no consumers to
     # push the number of sent messages past the allotted buffer size
diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py
index f5a55a381..4310b6de0 100644
--- a/tests/dragon/test_protoclient.py
+++ b/tests/dragon/test_protoclient.py
@@ -35,14 +35,14 @@
 dragon = pytest.importorskip("dragon")
 
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
-from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
     EventBroadcaster,
     OnWriteFeatureStore,
 )
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict
-from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
 
@@ -56,24 +56,40 @@
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
-WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+WORK_QUEUE_KEY = BackboneFeatureStore.MLI_WORKER_QUEUE
 
 logger = get_logger(__name__)
 
 
 @pytest.fixture(scope="session")
 def storage_for_dragon_fs() -> t.Dict[str, str]:
-    # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3)
+    """Fixture that creates a dragon distributed dictionary.
+
+    :returns: The created distributed dictionary
+    """
     return dragon_ddict.DDict(1, 2, 4 * 1024**2)
 
 
 @pytest.fixture(scope="session")
 def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore:
+    """Fixture that creates a dragon backbone feature store.
+
+    :param storage_for_dragon_fs: The dragon storage engine to use
+    :returns: The attached `BackboneFeatureStore`
+    """
+
    return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)
 
 
 @pytest.fixture(scope="session")
 def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel:
-    """a stand-in for the worker manager so a worker queue exists"""
+    """Fixture that creates a dragon FLI channel as a stand-in for the
+    worker queue created by the worker.
+
+    :param the_backbone: The backbone feature store to update
+    with the worker queue descriptor.
+ :returns: The attached `DragonFLIChannel` + """ # create the FLI to_worker_channel = create_local() @@ -91,28 +107,13 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: return comm_channel -@pytest.fixture -def storage_for_dragon_fs_with_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # create a valid FLI so any call to attach does not fail - channel_ = Channel.make_process_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) - - storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor - return storage_for_dragon_fs - - @pytest.mark.parametrize( "wait_timeout, exp_wait_max", [ # aggregate the 1+1+1 into 3 on remaining parameters - pytest.param( - 0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps", marks=pytest.mark.skip - ), - pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps", marks=pytest.mark.skip), - pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps", marks=pytest.mark.skip), + pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), + pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), + pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), ], ) def test_protoclient_timeout( @@ -150,11 +151,20 @@ def test_protoclient_timeout( assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" -def test_protoclient_initialization_no_backbone(): +def test_protoclient_initialization_no_backbone( + monkeypatch: pytest.MonkeyPatch, the_worker_queue: DragonFLIChannel +): """Verify that attempting to start the client without required environment variables - results in an exception. NOTE: Backbone env var is not set""" + results in an exception. + + :param the_worker_queue: Passing the worker queue fixture to ensure + the worker queue environment is correctly configured. + + NOTE: os.environ[BackboneFeatureStore.MLI_BACKBONE] is not set""" + + with monkeypatch.context() as patch, pytest.raises(SmartSimError) as ex: + patch.setenv(BackboneFeatureStore.MLI_BACKBONE, "") - with pytest.raises(SmartSimError) as ex: ProtoClient(timing_on=False) # confirm the missing value error has been raised diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index e111f8c74..b6be86177 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -25,11 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import gc -import logging import os import pathlib import subprocess as sp -import sys import time import typing as t from queue import Empty @@ -54,11 +52,9 @@ from dragon.data.ddict.ddict import DDict from dragon.managed_memory import MemoryAlloc -from smartsim._core.mli.comm.channel.dragon_channel import ( - DragonCommChannel, - create_local, -) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestBatch, RequestDispatcher, diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index b2ddb3481..0feefdb51 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -1,339 +1,313 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. 
-# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# import io -# import logging -# import pathlib -# import time - -# import pytest - -# torch = pytest.importorskip("torch") -# dragon = pytest.importorskip("dragon") - -# import multiprocessing as mp - -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# BackboneFeatureStore, -# ) -# from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor - -# try: -# mp.set_start_method("dragon") -# except Exception: -# pass - -# import os - -# import dragon.channels as dch -# import torch.nn as nn -# from dragon import fli -# from dragon.data.ddict.ddict import DDict - -# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -# from smartsim._core.mli.infrastructure.control.worker_manager import ( -# EnvironmentConfigLoader, -# WorkerManager, -# ) -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( -# DragonFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -# from smartsim._core.mli.message_handler import MessageHandler -# from smartsim.log import get_logger - -# from .utils.channel import FileSystemCommChannel - -# logger = get_logger(__name__) -# # The tests in this file belong to the dragon group -# pytestmark = pytest.mark.dragon - - -# class MiniModel(nn.Module): -# def __init__(self): -# super().__init__() - -# self._name = "mini-model" -# self._net = torch.nn.Linear(2, 1) - -# def forward(self, input): -# return self._net(input) - -# @property -# def bytes(self) -> bytes: -# """Returns the model serialized to a byte stream""" -# buffer = io.BytesIO() -# scripted = torch.jit.trace(self._net, self.get_batch()) -# torch.jit.save(scripted, buffer) -# return buffer.getvalue() - -# @classmethod -# def get_batch(cls) -> "torch.Tensor": -# return torch.randn((100, 2), dtype=torch.float32) - - -# def create_model(model_path: pathlib.Path) -> pathlib.Path: -# """Create a simple torch model and persist to disk for -# testing purposes. 
- -# TODO: remove once unit tests are in place""" -# if not model_path.parent.exists(): -# model_path.parent.mkdir(parents=True, exist_ok=True) - -# model_path.unlink(missing_ok=True) - -# mini_model = MiniModel() -# torch.save(mini_model, model_path) - -# return model_path - - -# def load_model() -> bytes: -# """Create a simple torch model in memory for testing""" -# mini_model = MiniModel() -# return mini_model.bytes - - -# def mock_messages( -# feature_store_root_dir: pathlib.Path, -# comm_channel_root_dir: pathlib.Path, -# kill_queue: mp.Queue, -# ) -> None: -# """Mock event producer for triggering the inference pipeline""" -# feature_store_root_dir.mkdir(parents=True, exist_ok=True) -# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - -# iteration_number = 0 - -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) -# backbone = config_loader.get_backbone() - -# worker_queue = config_loader.get_queue() -# if worker_queue is None: -# queue_desc = config_loader._queue_descriptor -# logger.warn( -# f"FLI input queue not loaded correctly from config_loader: {queue_desc}" -# ) - -# model_key = "mini-model" -# model_bytes = load_model() -# backbone[model_key] = model_bytes - -# message_model_key = MessageHandler.build_model_key( -# model_key, backbone.descriptor -# ) - -# while True: -# if not kill_queue.empty(): -# return -# iteration_number += 1 -# time.sleep(1) -# # 1. for demo, ignore upstream and just put stuff into downstream -# # 2. for demo, only one downstream but we'd normally have to filter -# # msg content and send to the correct downstream (worker) queue -# # timestamp = time.time_ns() -# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" -# # mock_channel.touch() - -# # thread - just look for key (wait for keys) -# # call checkpoint, try to get non-persistent key, it blocks -# # working set size > 1 has side-effects -# # only incurs cost when working set size has been exceeded - -# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" -# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - -# # input_key = f"my-input-{iteration_number}" -# output_key = f"my-output-{iteration_number}" - -# batch = MiniModel.get_batch() -# shape = batch.shape -# batch_bytes = batch.numpy().tobytes() -# # backbone[input_key] = batch_bytes - -# logger.debug(f"Model content: {backbone[model_key][:20]}") -# # logger.debug(f"Input content: {backbone[input_key][:20]}") - -# fsd = backbone.descriptor - -# # message_tensor_output_key = MessageHandler.build_tensor_key( -# # output_key, fsd -# # ) -# # message_tensor_input_key = MessageHandler.build_tensor_key( -# # input_key, fsd -# # ) - -# input_descriptor = MessageHandler.build_tensor_descriptor( -# "f", "float32", list(shape) -# ) - -# # output_descriptor = MessageHandler.build_output_tensor_descriptor( -# # "f", [], "float32", list(shape) -# # ) - -# # The first request is always the metadata... 
-# request = MessageHandler.build_request( -# reply_channel=callback_channel.descriptor, -# # model=message_model_key, -# model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"), -# # inputs=[message_tensor_input_key], -# inputs=[input_descriptor], -# # outputs=[message_tensor_output_key], -# outputs=[], -# # output_descriptors=[output_descriptor], -# output_descriptors=[], -# custom_attributes=None, -# ) -# request_bytes = MessageHandler.serialize_request(request) -# fli: DragonFLIChannel = worker_queue - -# with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh: -# sendh.send_bytes(request_bytes) -# sendh.send_bytes(batch_bytes) - -# # worker_queue.send(request_bytes) -# # follow up with the actual data -# # worker_queue.send(batch_bytes) - -# logger.info("published message") - -# if iteration_number > 5: -# return - - -# def mock_mli_infrastructure_mgr(): -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) - -# integrated_worker = TorchWorker - -# worker_manager = WorkerManager( -# config_loader, -# integrated_worker, -# as_service=True, -# cooldown=10, -# device="cpu", -# dispatcher_queue=mp.Queue(maxsize=0), -# ) -# worker_manager.execute() - - -# @pytest.fixture -# def prepare_environment(test_dir: str) -> pathlib.Path: -# """Cleanup prior outputs to run demo repeatedly""" -# path = pathlib.Path(f"{test_dir}/workermanager.log") -# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) -# return path - - -# def test_worker_manager(prepare_environment: pathlib.Path) -> None: -# """Test the worker manager""" - -# test_path = prepare_environment -# fs_path = test_path / "feature_store" -# comm_path = test_path / "comm_store" - -# # old instantiation code start -# # to_worker_channel = dch.Channel.make_process_local() -# # to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) -# # to_worker_fli_serialized = to_worker_fli.serialize() - -# # # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader -# # # or test environment may be unable to send messages w/queue -# # descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") -# # os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - -# mgr_per_node = 1 -# num_nodes = 2 -# mem_per_node = 1024**3 -# total_mem = num_nodes * mem_per_node - -# storage = DDict( -# managers_per_node=mgr_per_node, -# n_nodes=num_nodes, -# total_mem=total_mem, -# ) -# backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) - -# to_worker_channel = dch.Channel.make_process_local() -# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - -# to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) - -# # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader -# # or test environment may be unable to send messages w/queue -# os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor -# os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor - -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) -# integrated_worker_type = TorchWorker - -# worker_manager = WorkerManager( -# config_loader, -# integrated_worker_type, 
-# as_service=True, -# cooldown=5, -# device="cpu", -# dispatcher_queue=mp.Queue(maxsize=0), -# ) - -# worker_queue = config_loader.get_queue() -# if worker_queue is None: -# logger.warn( -# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" -# ) -# backbone.worker_queue = to_worker_fli_comm_channel.descriptor - -# # create a mock client application to populate the request queue -# kill_queue = mp.Queue() -# msg_pump = mp.Process( -# target=mock_messages, -# args=(fs_path, comm_path, kill_queue), -# ) -# msg_pump.start() - -# # create a process to execute commands -# process = mp.Process(target=mock_mli_infrastructure_mgr) - -# # let it send some messages before starting the worker manager -# msg_pump.join(timeout=5) -# process.start() -# msg_pump.join(timeout=5) -# kill_queue.put_nowait("kill!") -# process.join(timeout=5) -# msg_pump.kill() -# process.kill() +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io +import logging +import pathlib +import time + +import pytest + +from smartsim._core.mli.comm.channel.dragon_util import create_local + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import multiprocessing as mp + +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +import torch.nn as nn +from dragon import fli +from dragon.data.ddict.ddict import DDict + +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.worker_manager import ( + EnvironmentConfigLoader, + WorkerManager, +) +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +class MiniModel(nn.Module): + """A torch model that can be executed by the default torch worker""" + + def __init__(self): + """Initialize the model.""" + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + """Execute a forward pass.""" + return self._net(input) + + @property + def bytes(self) -> bytes: + """Retrieve the serialized model + + :returns: The byte stream of the model file + """ + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + """Generate a single batch of data with the correct + shape for inference. + + :returns: The batch as a torch tensor + """ + return torch.randn((100, 2), dtype=torch.float32) + + +def create_model(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+
+    :param model_path: The path to the torch model file
+    """
+    if not model_path.parent.exists():
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+    model_path.unlink(missing_ok=True)
+
+    mini_model = MiniModel()
+    torch.save(mini_model, model_path)
+
+    return model_path
+
+
+def load_model() -> bytes:
+    """Create a simple torch model in memory for testing."""
+    mini_model = MiniModel()
+    return mini_model.bytes
+
+
+def mock_messages(
+    feature_store_root_dir: pathlib.Path,
+    comm_channel_root_dir: pathlib.Path,
+    kill_queue: mp.Queue,
+) -> None:
+    """Mock event producer for triggering the inference pipeline"""
+    feature_store_root_dir.mkdir(parents=True, exist_ok=True)
+    comm_channel_root_dir.mkdir(parents=True, exist_ok=True)
+
+    iteration_number = 0
+
+    config_loader = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=FileSystemCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_sender_supplied_descriptor,
+    )
+    backbone = config_loader.get_backbone()
+
+    worker_queue = config_loader.get_queue()
+    if worker_queue is None:
+        queue_desc = config_loader._queue_descriptor
+        logger.warning(
+            f"FLI input queue not loaded correctly from config_loader: {queue_desc}"
+        )
+
+    model_key = "mini-model"
+    model_bytes = load_model()
+    backbone[model_key] = model_bytes
+
+    while True:
+        if not kill_queue.empty():
+            return
+        iteration_number += 1
+        time.sleep(1)
+
+        channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt"
+        callback_channel = FileSystemCommChannel(pathlib.Path(channel_key))
+
+        batch = MiniModel.get_batch()
+        shape = batch.shape
+        batch_bytes = batch.numpy().tobytes()
+
+        logger.debug(f"Model content: {backbone[model_key][:20]}")
+
+        input_descriptor = MessageHandler.build_tensor_descriptor(
+            "f", "float32", list(shape)
+        )
+
+        # The first request is always the metadata...
+        request = MessageHandler.build_request(
+            reply_channel=callback_channel.descriptor,
+            model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"),
+            inputs=[input_descriptor],
+            outputs=[],
+            output_descriptors=[],
+            custom_attributes=None,
+        )
+        request_bytes = MessageHandler.serialize_request(request)
+        fli: DragonFLIChannel = worker_queue
+
+        with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh:
+            sendh.send_bytes(request_bytes)
+            sendh.send_bytes(batch_bytes)
+
+        logger.info("published message")
+
+        if iteration_number > 5:
+            return
+
+
+def mock_mli_infrastructure_mgr() -> None:
+    """Create resources normally instantiated by the infrastructure
+    management portion of the DragonBackend
+    """
+    config_loader = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=FileSystemCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_sender_supplied_descriptor,
+    )
+
+    integrated_worker = TorchWorker
+
+    worker_manager = WorkerManager(
+        config_loader,
+        integrated_worker,
+        as_service=True,
+        cooldown=10,
+        device="cpu",
+        dispatcher_queue=mp.Queue(maxsize=0),
+    )
+    worker_manager.execute()
+
+
+@pytest.fixture
+def prepare_environment(test_dir: str) -> pathlib.Path:
+    """Cleanup prior outputs to run demo repeatedly.
+
+    :param test_dir: the directory to prepare
+    :returns: The path to the log file"""
+    path = pathlib.Path(f"{test_dir}/workermanager.log")
+    logging.basicConfig(filename=path.absolute(), level=logging.DEBUG)
+    return path
+
+
+def test_worker_manager(prepare_environment: pathlib.Path) -> None:
+    """Test the worker manager.
+ + :param prepare_environment: Pass this fixture to configure + global resources before the worker manager executes + """ + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + mgr_per_node = 1 + num_nodes = 2 + mem_per_node = 1024**3 + total_mem = num_nodes * mem_per_node + + storage = DDict( + managers_per_node=mgr_per_node, + n_nodes=num_nodes, + total_mem=total_mem, + ) + backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) + + to_worker_channel = create_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) + + # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor + os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + ) + integrated_worker_type = TorchWorker + + worker_manager = WorkerManager( + config_loader, + integrated_worker_type, + as_service=True, + cooldown=5, + device="cpu", + dispatcher_queue=mp.Queue(maxsize=0), + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + backbone.worker_queue = to_worker_fli_comm_channel.descriptor + + # create a mock client application to populate the request queue + kill_queue = mp.Queue() + msg_pump = mp.Process( + target=mock_messages, + args=(fs_path, comm_path, kill_queue), + ) + msg_pump.start() + + # create a process to execute commands + process = mp.Process(target=mock_mli_infrastructure_mgr) + + # let it send some messages before starting the worker manager + msg_pump.join(timeout=5) + process.start() + msg_pump.join(timeout=5) + kill_queue.put_nowait("kill!") + process.join(timeout=5) + msg_pump.kill() + process.kill() diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 003d79400..4c46359c2 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -42,7 +42,8 @@ class FileSystemCommChannel(CommChannelBase): def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance. - :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() super().__init__(key.as_posix()) @@ -57,7 +58,7 @@ def send(self, value: bytes, timeout: float = 0) -> None: """Send a message throuh the underlying communication channel. :param value: The value to send - :param timeout: Maximum time to wait (in seconds) for messages to send + :param timeout: maximum time to wait (in seconds) for messages to send """ with self._lock: # write as text so we can add newlines as delimiters @@ -67,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. 
:param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -100,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,10 +112,11 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: path = pathlib.Path(descriptor) return FileSystemCommChannel(path) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 1bbf159b1..4c46359c2 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -40,9 +40,10 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" def __init__(self, key: pathlib.Path) -> None: - """Initialize the FileSystemCommChannel instance + """Initialize the FileSystemCommChannel instance. - :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() super().__init__(key.as_posix()) @@ -54,7 +55,7 @@ def __init__(self, key: pathlib.Path) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send @@ -67,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -100,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,10 +112,11 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. 
 
         :param descriptor: The descriptor that uniquely identifies the resource
-        :returns: An attached FileSystemCommChannel"""
+        :returns: An attached FileSystemCommChannel
+        """
         try:
             path = pathlib.Path(descriptor)
             return FileSystemCommChannel(path)

From f5b7b7d3ada9e48cf350f9f93e27df088ea1f927 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Wed, 25 Sep 2024 18:07:55 -0500
Subject: [PATCH 10/40] fix infinite loop bug in consumer batch receive

---
 .../storage/backbone_feature_store.py         | 72 ++++++++++--------
 tests/dragon/test_featurestore_integration.py | 73 +++++++++++--------
 2 files changed, 85 insertions(+), 60 deletions(-)
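This change moves batch control into `recv` itself: each call now drains the channel for at most `batch_timeout` seconds instead of relying on consumer-wide timeout state, which could leave the receive loop spinning. A sketch of the intended polling pattern (the `consumer` wiring and `handle` callback are assumed, not part of this patch):

    # assumes `consumer` is an EventConsumer attached to a channel and backbone
    while True:
        events = consumer.recv(timeout=0.1, batch_timeout=1.0)
        if not events:
            break  # batch window closed with nothing received
        for event in events:
            handle(event)  # hypothetical application callback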
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index f8515220f..d247c8952 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -569,7 +569,6 @@ def __init__(
         comm_channel: CommChannelBase,
         backbone: BackboneFeatureStore,
         filters: t.Optional[t.List[EventCategory]] = None,
-        batch_timeout: t.Optional[float] = None,
         name: t.Optional[str] = None,
         event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
     ) -> None:
@@ -583,13 +582,9 @@ def __init__(
         auto-generated GUID will be used
-        :raises ValueError: If batch_timeout <= 0
         """
-        if batch_timeout is not None and batch_timeout <= 0:
-            raise ValueError("batch_timeout must be a non-zero, positive value")
-
         self._comm_channel = comm_channel
         self._backbone = backbone
         self._global_filters = filters or []
-        self._global_timeout = batch_timeout or 1.0
         self._name = name
         self._event_handler = event_handler
@@ -612,50 +607,67 @@ def name(self) -> str:
         return self._name
 
     def recv(
-        self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0.001
+        self,
+        filters: t.Optional[t.List[EventCategory]] = None,
+        timeout: float = 0.001,
+        batch_timeout: float = 1.0,
     ) -> t.List[EventBase]:
         """Receives available published event(s).
 
         :param filters: Additional filters to add to the global filters configured
         on the EventConsumer instance
-        :param timeout: Maximum time to wait for messages to arrive
+        :param timeout: Maximum time to wait for a single message to arrive
+        :param batch_timeout: Maximum time to wait for messages to arrive; allows
+        multiple batches to be retrieved in one call to `recv`
         :returns: A list of events that pass any configured filters
+        :raises ValueError: If batch_timeout is not a positive value
         """
         if filters is None:
             filters = []
 
+        if batch_timeout <= 0:
+            raise ValueError("batch_timeout must be a non-zero, positive value")
+
         filter_set = {*self._global_filters, *filters}
-        messages: t.List[t.Any] = []
+        all_message_bytes: t.List[bytes] = []
 
-        # use the local timeout to override a global setting
-        start_at = time.time_ns()
+        # firehose as many messages as possible within the batch_timeout
+        start_at = time.time()
+        remaining = batch_timeout
 
-        while msg_bytes_list := self._comm_channel.recv(timeout=timeout):
+        batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+        while batch_message_bytes:
             # remove any empty messages that will fail to decode
-            msg_bytes_list = [msg for msg in msg_bytes_list if msg]
+            all_message_bytes.extend(batch_message_bytes)
+            batch_message_bytes = []
+
+            # avoid getting stuck indefinitely waiting for the channel
+            elapsed = time.time() - start_at
+            remaining = batch_timeout - elapsed
 
-            msg: t.Optional[EventBase] = None
-            if msg_bytes_list:
-                for message in msg_bytes_list:
-                    msg = pickle.loads(message)
+            if remaining > 0:
+                batch_message_bytes = self._comm_channel.recv(timeout=timeout)
 
-                    if not msg:
-                        logger.warning("Unable to unpickle message")
-                        continue
+        events_received: t.List[EventBase] = []
 
-            # ignore anything that doesn't match a filter (if one is
-            # supplied), otherwise return everything
-            if not filter_set or msg.category in filter_set:
-                messages.append(msg)
+        # Timeout elapsed or no messages received - return the empty list
+        if not all_message_bytes:
+            return events_received
 
-            # avoid getting stuck indefinitely waiting for the channel
-            elapsed = (time.time_ns() - start_at) / 1000000000
-            remaining = elapsed - self._global_timeout
-            if remaining > 0:
-                logger.debug(f"Consumer batch timeout exceeded by: {abs(remaining)}")
-                break
+        for message in all_message_bytes:
+            if not message:
+                continue
+
+            event = pickle.loads(message)
+            if not event:
+                logger.warning("Unable to unpickle message")
+                continue
+
+            # skip events that don't pass a filter
+            if filter_set and event.category not in filter_set:
+                continue
+
+            events_received.append(event)
 
-        return messages
+        return events_received
 
     def register(self) -> None:
         """Send an event to register this consumer as a listener"""
diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py
index fa6f99001..0c8679224 100644
--- a/tests/dragon/test_featurestore_integration.py
+++ b/tests/dragon/test_featurestore_integration.py
@@ -58,11 +58,23 @@
 pytestmark = pytest.mark.dragon
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def storage_for_dragon_fs() -> t.Dict[str, str]:
     return dragon_ddict.DDict()
 
 
+@pytest.fixture(scope="session")
+def the_worker_channel() -> DragonCommChannel:
+    wmgr_channel_ = create_local()
+    wmgr_channel = DragonCommChannel(wmgr_channel_)
+    return wmgr_channel
+
+
+@pytest.fixture(scope="session")
+def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore:
+    return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)
+
+
 def 
test_eventconsumer_eventpublisher_integration( storage_for_dragon_fs: t.Any, test_dir: str ) -> None: @@ -147,19 +159,21 @@ def test_eventconsumer_eventpublisher_integration( @pytest.mark.parametrize( - "num_events, batch_timeout", + "num_events, batch_timeout, max_batches_expected", [ - pytest.param(1, 1.0, id="under 1s timeout"), - pytest.param(20, 1.0, id="test 1s timeout w/20"), - pytest.param(50, 1.0, id="test 1s timeout w/50"), - pytest.param(60, 0.1, id="small batches"), - pytest.param(100, 0.1, id="many small batches"), + pytest.param(1, 1.0, 2, id="under 1s timeout"), + pytest.param(20, 1.0, 3, id="test 1s timeout 20x"), + pytest.param(30, 0.2, 5, id="test 0.2s timeout 30x"), + pytest.param(60, 0.4, 4, id="small batches"), + pytest.param(100, 0.1, 10, id="many small batches"), ], ) def test_eventconsumer_max_dequeue( num_events: int, batch_timeout: float, - storage_for_dragon_fs: t.Any, + max_batches_expected: int, + the_worker_channel: DragonCommChannel, + the_backbone: BackboneFeatureStore, ) -> None: """Verify that a consumer does not sit and collect messages indefinitely by checking that a consumer returns after a maximum timeout is exceeded. @@ -170,57 +184,56 @@ def test_eventconsumer_max_dequeue( :param storage_for_dragon_fs: Dragon storage engine to use """ - mock_storage = storage_for_dragon_fs - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - - wmgr_channel_ = Channel.make_process_local() - wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor - # create some consumers to receive messages wmgr_consumer = EventConsumer( - wmgr_channel, - backbone, + the_worker_channel, + the_backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN], - batch_timeout=batch_timeout, ) # create a broadcaster to publish messages mock_client_app = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) # register all of the consumers even though the OnCreateConsumer really should # trigger its registration. event processing is tested elsewhere. 
-    backbone.notification_channels = [wmgr_consumer_descriptor]
+    the_backbone.notification_channels = [the_worker_channel.descriptor]

     # simulate the app updating a model a lot of times
     for key in (f"key-{i}" for i in range(num_events)):
-        event = OnWriteFeatureStore(backbone.descriptor, key)
-        mock_client_app.send(event, timeout=0.1)
+        event = OnWriteFeatureStore(the_backbone.descriptor, key)
+        mock_client_app.send(event, timeout=0.01)

     num_dequeued = 0
+    num_batches = 0

-    while wmgr_messages := wmgr_consumer.recv(timeout=0.01):
+    while wmgr_messages := wmgr_consumer.recv(
+        timeout=0.1,
+        batch_timeout=batch_timeout,
+    ):
         # worker manager should not receive more events than were sent
         num_dequeued += len(wmgr_messages)
+        num_batches += 1

     # make sure we made all the expected dequeue calls and got everything
     assert num_dequeued == num_events
+    assert num_batches > 0
+    assert num_batches < max_batches_expected, "too many recv calls were made"


 @pytest.mark.parametrize(
     "buffer_size",
     [
-        pytest.param(-1, id="use default: 500"),
-        pytest.param(0, id="use default: 500"),
+        pytest.param(-1, id="replace negative, default to 500"),
+        pytest.param(0, id="replace zero, default to 500"),
         pytest.param(1, id="non-zero buffer size: 1"),
-        pytest.param(500, id="buffer size: 500"),
-        pytest.param(800, id="buffer size: 800"),
+        pytest.param(550, id="larger than default: 550"),
+        pytest.param(800, id="much larger than default: 800"),
         pytest.param(
             1000,
-            id="buffer size: 1000, unreliable in dragon-v0.10",
+            id="very large buffer: 1000, unreliable in dragon-v0.10",
             marks=pytest.mark.skip,
         ),
     ],
 )
@@ -261,8 +274,8 @@ def test_channel_buffer_size(
     # simulate the app updating a model a lot of times
     for key in (f"key-{i}" for i in range(buffer_size)):
         event = OnWriteFeatureStore(backbone.descriptor, key)
-        mock_client_app.send(event, timeout=0.1)
+        mock_client_app.send(event, timeout=0.01)

     # adding 1 more over the configured buffer size should report the error
     with pytest.raises(Exception) as ex:
-        mock_client_app.send(event, timeout=0.1)
+        mock_client_app.send(event, timeout=0.01)
From bd037112bd848102e2d538a17fd74eddd1b0c046 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Wed, 25 Sep 2024 18:18:39 -0500
Subject: [PATCH 11/40] Sort imports to solve dragon import issue in non-dragon
 tests

---
 tests/dragon/test_dragon_backend.py | 24 ++++++++----------------
 tests/dragon/test_worker_manager.py |  3 +--
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index 0e16be5e2..dc2aceeaa 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -30,30 +30,22 @@

 import pytest

-from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer
-from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
-    BackboneFeatureStore,
-    EventBase,
-    EventBroadcaster,
-    EventConsumer,
-    EventSender,
-    OnCreateConsumer,
-)
-from smartsim.log import get_logger
-
 dragon = pytest.importorskip("dragon")

-import dragon.utils as du
 from dragon.channels import Channel
 from dragon.data.ddict.ddict import DDict
 from dragon.fli import DragonFLIError, FLInterface

+from smartsim._core.launcher.dragon.dragonBackend import DragonBackend
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
-from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
-from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
-from 
smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBase, + EventBroadcaster, + EventConsumer, + OnCreateConsumer, ) +from smartsim.log import get_logger # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 0feefdb51..69d962940 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -31,8 +31,6 @@ import pytest -from smartsim._core.mli.comm.channel.dragon_util import create_local - torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -56,6 +54,7 @@ from dragon.data.ddict.ddict import DDict from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, WorkerManager, From 98260f3eadd6184c53d5c1a04ddd2f3a2a999241 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 25 Sep 2024 19:28:05 -0500 Subject: [PATCH 12/40] swap session scopes to module to destroy dragon resources --- tests/dragon/test_featurestore_integration.py | 6 +++--- tests/dragon/test_protoclient.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 0c8679224..091610592 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -58,19 +58,19 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: return dragon_ddict.DDict() -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) return wmgr_channel -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 4310b6de0..c758ce971 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -60,7 +60,7 @@ logger = get_logger(__name__) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: """Fixture that creates a dragon distributed dictionary. @@ -69,7 +69,7 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: return dragon_ddict.DDict(1, 2, 4 * 1024**2) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: """Fixture that creates a dragon backbone feature store. @@ -81,7 +81,7 @@ def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: """Fixture that creates a dragon FLI channel as a stand-in for the worker queue created by the worker. 
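
The scope swaps in the patch above (and the fixture reuse for speed in the next
patch) rely on standard pytest fixture semantics: a `module`-scoped fixture is
created once per test module and finalized when that module's tests complete,
so dragon-backed resources are released between modules instead of living for
the entire session. A minimal sketch of the pattern, using a plain dict as a
stand-in for a dragon `DDict` (all names here are illustrative, not taken from
the patches):

import pytest


@pytest.fixture(scope="module")
def shared_resource() -> dict:
    # setup runs once per test module, not once per test function
    resource = {"connected": True}  # stand-in for an expensive dragon resource
    yield resource
    # teardown runs after the last test in the module, releasing the
    # resource before the next module's fixtures are created
    resource["connected"] = False


def test_first_use(shared_resource: dict) -> None:
    assert shared_resource["connected"]


def test_reuse_same_instance(shared_resource: dict) -> None:
    # the same object is reused within the module, which is what makes
    # module scope faster than function scope for costly setup
    assert shared_resource["connected"]
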
From 5ba2a4216a328b6712a73adaebdc056d7c36e8cc Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 26 Sep 2024 12:46:46 -0500 Subject: [PATCH 13/40] use make process local to avoid MPI issue, fix some test regressions, reuse some test fixtures for speed --- .../standalone_worker_manager.py | 9 +- .../_core/mli/comm/channel/dragon_util.py | 75 ++++--- .../storage/backbone_feature_store.py | 4 +- tests/dragon/test_environment_loader.py | 8 +- tests/dragon/test_error_handling.py | 48 ++-- tests/dragon/test_featurestore.py | 207 +++++++++++------- tests/dragon/test_featurestore_base.py | 12 +- tests/dragon/test_featurestore_integration.py | 46 +++- 8 files changed, 248 insertions(+), 161 deletions(-) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index 1d0b11055..fdef4268a 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -46,20 +46,17 @@ import argparse import base64 import multiprocessing as mp -import optparse import os -import pickle import socket -import sys import time import typing as t import cloudpickle from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -71,8 +68,6 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -144,7 +139,7 @@ def service_as_dragon_proc( backbone = BackboneFeatureStore.from_descriptor(ddict_str) - to_worker_channel = Channel.make_process_local() + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, True) diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py index 2980dc9a6..014e9c0a4 100644 --- a/smartsim/_core/mli/comm/channel/dragon_util.py +++ b/smartsim/_core/mli/comm/channel/dragon_util.py @@ -49,7 +49,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str: - """Utility method for converting a channel to a descriptor string. + """Convert a dragon channel to a descriptor string. :param channel: The dragon channel to convert :returns: The descriptor string @@ -62,7 +62,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str def pool_to_descriptor(pool: dm.MemoryPool) -> str: - """Utility method for converting a pool to a descriptor string. + """Convert a dragon memory pool to a descriptor string. :param pool: The memory pool to convert :returns: The descriptor string""" @@ -74,7 +74,7 @@ def pool_to_descriptor(pool: dm.MemoryPool) -> str: def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": - """Helper method to attach a new FLI instance given + """Create and attach a new FLI instance given the string-encoded descriptor. 
:param descriptor: The descriptor of an FLI to attach to
@@ -84,7 +84,7 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface":


 def descriptor_to_channel(descriptor: str) -> dch.Channel:
-    """Helper method to attach a new Channel instance given
+    """Create and attach a new Channel instance given
     the string-encoded descriptor.

     :param descriptor: The descriptor of a channel to attach to
@@ -93,7 +93,7 @@
     return dch.Channel.attach(descriptor_)


-def create_local(capacity: int = 0) -> dch.Channel:
+def create_local(_capacity: int = 0) -> dch.Channel:
     """Creates a Channel attached to the local memory pool. Replacement for
     direct calls to `dch.Channel.make_process_local()` to enable
     supplying a channel capacity.
@@ -103,35 +103,38 @@
     :returns: The instantiated channel
     :raises SmartSimError: If unable to attach local channel
     """
-    pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd))
-    pool_descriptor = pool_to_descriptor(pool)
-    channel: t.Optional[dch.Channel] = None
-    offset = 0
-
-    global LAST_OFFSET
-    if LAST_OFFSET:
-        offset = LAST_OFFSET
-
-    capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE
-
-    while not channel:
-        # search for an open channel ID
-        offset += 1
-        channel_id = df.BASE_USER_MANAGED_CUID + offset
-        try:
-            channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity)
-            LAST_OFFSET = offset
-            descriptor = channel_to_descriptor(channel)
-            logger.debug(
-                "Local channel created: "
-                f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}"
-            )
-        except dch.ChannelError as e:
-            if offset < 100:
-                logger.warning(f"Channel id `{channel_id}` is not open. Retrying...")
-            else:
-                LAST_OFFSET = 0
-                logger.error(f"All attempts to attach local channel have failed")
-                raise SmartSimError("Failed to attach local channel") from e
-
+    # current implementation has a bug wrt MPI that must be fixed.
+    # falling back to `make_process_local` and disabling buffer size tests
+
+    # pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd))
+    # pool_descriptor = pool_to_descriptor(pool)
+    # channel: t.Optional[dch.Channel] = None
+    # offset = 0
+
+    # global LAST_OFFSET
+    # if LAST_OFFSET:
+    #     offset = LAST_OFFSET
+
+    # capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE
+
+    # while not channel:
+    #     # search for an open channel ID
+    #     offset += 1
+    #     channel_id = df.BASE_USER_MANAGED_CUID + offset
+    #     try:
+    #         channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity)
+    #         LAST_OFFSET = offset
+    #         descriptor = channel_to_descriptor(channel)
+    #         logger.debug(
+    #             "Local channel created: "
+    #             f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}"
+    #         )
+    #     except dch.ChannelError as e:
+    #         if offset < 100:
+    #             logger.warning(f"Channel id `{channel_id}` is not open. 
Retrying...") + # else: + # LAST_OFFSET = 0 + # logger.error(f"All attempts to attach local channel have failed") + # raise SmartSimError("Failed to attach local channel") from e + channel = dch.Channel.make_process_local() return channel diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index d247c8952..1542f3811 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -548,16 +548,16 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: :param timeout: Maximum time to wait (in seconds) for messages to send :returns: The number of events successfully published :raises ValueError: If event serialization fails + :raises AttributeError: If event cannot be serialized :raises KeyError: If channel fails to attach using registered descriptors :raises SmartSimError: If any unexpected error occurs during send """ try: self._save_to_buffer(event) return self._broadcast(timeout) - except (KeyError, ValueError, SmartSimError): + except (KeyError, ValueError, AttributeError, SmartSimError): raise except Exception as ex: - logger.exception("An unexpected exception occurred while sending") raise SmartSimError("An unexpected failure occurred while sending") from ex diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 4f45614d9..47e75109a 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -29,12 +29,12 @@ dragon = pytest.importorskip("dragon") import dragon.utils as du -from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.fli import DragonFLIError, FLInterface +from dragon.fli import FLInterface from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, @@ -54,7 +54,7 @@ ) def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): """A descriptor can be stored, loaded, and reattached""" - chan = Channel.make_process_local() + chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) @@ -77,7 +77,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded queue are the same""" - chan = Channel.make_process_local() + chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index b0934b6f5..1e659168b 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -33,7 +33,6 @@ import multiprocessing as mp -import dragon.utils as du from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface @@ -41,7 +40,7 @@ from smartsim._core.mli.comm.channel.channel import CommChannelBase from 
smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.device_manager import WorkerDevice +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -62,7 +61,6 @@ ExecuteResult, FetchInputResult, FetchModelResult, - InferenceReply, InferenceRequest, LoadModelResult, MachineLearningWorkerBase, @@ -80,14 +78,26 @@ pytestmark = pytest.mark.dragon -@pytest.fixture +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. + + NOTE: using module scoped fixtures drastically improves test run-time""" + channel_ = create_local() + fli_ = FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + return comm_channel + + +@pytest.fixture(scope="module") def backbone_descriptor() -> str: # create a shared backbone featurestore feature_store = DragonFeatureStore(DDict()) return feature_store.descriptor -@pytest.fixture +@pytest.fixture(scope="module") def app_feature_store() -> FeatureStore: # create a standalone feature store to mimic a user application putting # data into an application-owned resource (app should not access backbone) @@ -101,14 +111,11 @@ def setup_worker_manager_model_bytes( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) @@ -160,14 +167,11 @@ def setup_worker_manager_model_key( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) @@ -217,14 +221,11 @@ def setup_request_dispatcher_model_bytes( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) @@ -261,14 +262,11 @@ def setup_request_dispatcher_model_key( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = 
FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index f59501df1..32e1c3a82 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -38,6 +38,7 @@ from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBroadcaster, @@ -68,42 +69,46 @@ pytestmark = pytest.mark.dragon -@pytest.fixture +@pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: + """Fixture to instantiate a dragon distributed dictionary. + + NOTE: using module scoped fixtures drastically improves test run-time""" return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) -@pytest.fixture -def storage_for_dragon_fs_with_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # create a valid FLI so any call to attach does not fail - channel_ = Channel.make_process_local() +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. + + NOTE: using module scoped fixtures drastically improves test run-time""" + # wmgr_channel_ = create_local() + # wmgr_channel = DragonCommChannel(wmgr_channel_) + # return wmgr_channel + channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) + return comm_channel - storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( - comm_channel.descriptor - ) - return storage_for_dragon_fs +@pytest.fixture(scope="module") +def the_backbone( + storage_for_dragon_fs: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. -@pytest.fixture -def storage_for_dragon_fs_with_mock_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # # create a valid FLI so any call to attach does not fail - # channel_ = Channel.make_process_local() - # fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - # comm_channel = DragonFLIChannel(fli_, True) + NOTE: using module scoped fixtures drastically improves test run-time""" - mock_descriptor = "12345" - storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = mock_descriptor - return storage_for_dragon_fs + backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone def test_eventconsumer_eventpublisher_integration( - storage_for_dragon_fs: t.Any, test_dir: str + the_backbone: BackboneFeatureStore, test_dir: str ) -> None: """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. 
This @@ -114,20 +119,13 @@ def test_eventconsumer_eventpublisher_integration( :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs""" - mock_storage = storage_for_dragon_fs - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - # verify ability to write and read from ddict - backbone["test_dir"] = test_dir - assert backbone["test_dir"] == test_dir - - wmgr_channel_ = Channel.make_process_local() - capp_channel_ = Channel.make_process_local() - back_channel_ = Channel.make_process_local() + the_backbone["test_dir"] = test_dir + assert the_backbone["test_dir"] == test_dir - wmgr_channel = DragonCommChannel(wmgr_channel_) - capp_channel = DragonCommChannel(capp_channel_) - back_channel = DragonCommChannel(back_channel_) + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) wmgr_consumer_descriptor = wmgr_channel.descriptor capp_consumer_descriptor = capp_channel.descriptor @@ -136,32 +134,32 @@ def test_eventconsumer_eventpublisher_integration( # create some consumers to receive messages wmgr_consumer = EventConsumer( wmgr_channel, - backbone, + the_backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN], ) capp_consumer = EventConsumer( capp_channel, - backbone, + the_backbone, ) back_consumer = EventConsumer( back_channel, - backbone, + the_backbone, filters=[EventCategory.CONSUMER_CREATED], ) # create some broadcasters to publish messages mock_worker_mgr = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) mock_client_app = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) # register all of the consumers even though the OnCreateConsumer really should # trigger its registration. event processing is tested elsewhere. - backbone.notification_channels = [ + the_backbone.notification_channels = [ wmgr_consumer_descriptor, capp_consumer_descriptor, back_consumer_descriptor, @@ -172,9 +170,9 @@ def test_eventconsumer_eventpublisher_integration( mock_worker_mgr.send(event_1) # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(backbone.descriptor, "key-1") - event_3 = OnWriteFeatureStore(backbone.descriptor, "key-2") - event_4 = OnWriteFeatureStore(backbone.descriptor, "key-1") + event_2 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") + event_3 = OnWriteFeatureStore(the_backbone.descriptor, "key-2") + event_4 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") mock_client_app.send(event_2) mock_client_app.send(event_3) @@ -194,7 +192,7 @@ def test_eventconsumer_eventpublisher_integration( def test_backbone_wait_for_no_keys( - storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for a value succeeds immediately and does not cause a wait to occur if the supplied key @@ -203,15 +201,12 @@ def test_backbone_wait_for_no_keys( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - storage = storage_for_dragon_fs_with_req_queue - - backbone = BackboneFeatureStore(storage) with monkeypatch.context() as ctx: # all keys should be found and the timeout should never be checked. 
ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([]) + values = the_backbone.wait_for([]) assert len(values) == 0 # confirm that no wait occurred @@ -219,7 +214,7 @@ def test_backbone_wait_for_no_keys( def test_backbone_wait_for_prepopulated( - storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for a value succeed immediately and do not cause a wait to occur if the data exists @@ -227,15 +222,12 @@ def test_backbone_wait_for_prepopulated( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - storage = storage_for_dragon_fs_with_req_queue - - backbone = BackboneFeatureStore(storage) with monkeypatch.context() as ctx: # all keys should be found and the timeout should never be checked. ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) + values = the_backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], 0.1) # confirm that wait_for with one key returns one value assert len(values) == 1 @@ -248,7 +240,7 @@ def test_backbone_wait_for_prepopulated( def test_backbone_wait_for_prepopulated_dupe( - storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for keys that are duplicated results in a single value being returned for each key @@ -256,19 +248,17 @@ def test_backbone_wait_for_prepopulated_dupe( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - storage = storage_for_dragon_fs_with_req_queue - backbone = BackboneFeatureStore(storage) key1, key2 = "key-1", "key-2" value1, value2 = "i-am-value-1", "i-am-value-2" - backbone[key1] = value1 - backbone[key2] = value2 + the_backbone[key1] = value1 + the_backbone[key2] = value2 with monkeypatch.context() as ctx: # all keys should be found and the timeout should never be checked. 
ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([key1, key2, key1]) # key1 is duplicated + values = the_backbone.wait_for([key1, key2, key1]) # key1 is duplicated # confirm that wait_for with one key returns one value assert len(values) == 2 @@ -294,10 +284,43 @@ def set_value_after_delay( logger.debug(f"set_value_after_delay wrote `{value} to backbone[`{key}`]") -@pytest.mark.skip(reason="Using mp on build agent is not working correctly") -@pytest.mark.parametrize("delay", [0, 1, 2, 4, 8]) +@pytest.mark.parametrize( + "delay", + [ + pytest.param( + 0, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 1, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 2, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 4, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 8, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + ], +) def test_backbone_wait_for_partial_prepopulated( - storage_for_dragon_fs_with_mock_req_queue: t.Any, delay: float + the_backbone: BackboneFeatureStore, delay: float ) -> None: """Verify that when data is not all in the backbone, the `wait_for` operation continues to poll until it finds everything it needs @@ -308,19 +331,17 @@ def test_backbone_wait_for_partial_prepopulated( """ # set a very low timeout to confirm that it does not wait wait_timeout = 10 - storage = storage_for_dragon_fs_with_mock_req_queue - backbone = BackboneFeatureStore(storage) key, value = str(uuid.uuid4()), str(random.random() * 10) logger.debug(f"Starting process to write {key} after {delay}s") p = mp.Process( - target=set_value_after_delay, args=(backbone.descriptor, key, value, delay) + target=set_value_after_delay, args=(the_backbone.descriptor, key, value, delay) ) p.start() p2 = mp.Process( - target=backbone.wait_for, + target=the_backbone.wait_for, args=([BackboneFeatureStore.MLI_WORKER_QUEUE, key],), kwargs={"timeout": wait_timeout}, ) @@ -330,7 +351,9 @@ def test_backbone_wait_for_partial_prepopulated( p2.join() # both values should be written at this time - ret_vals = backbone.wait_for([key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1) + ret_vals = the_backbone.wait_for( + [key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1 + ) # confirm that wait_for with two keys returns two values assert len(ret_vals) == 2, "values should contain values for both awaited keys" @@ -343,10 +366,43 @@ def test_backbone_wait_for_partial_prepopulated( assert ret_vals[key] == value, "verify order of values " -@pytest.mark.skip(reason="Using mp on build agent is not working correctly") -@pytest.mark.parametrize("num_keys", [0, 1, 3, 7, 11]) +@pytest.mark.parametrize( + "num_keys", + [ + pytest.param( + 0, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 1, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 3, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 7, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 11, + marks=pytest.mark.skip( + "Must use entrypoint 
instead of mp.Process to run on build agent" + ), + ), + ], +) def test_backbone_wait_for_multikey( - storage_for_dragon_fs_with_req_queue: t.Any, + the_backbone: BackboneFeatureStore, num_keys: int, test_dir: str, ) -> None: @@ -358,8 +414,6 @@ def test_backbone_wait_for_multikey( """ # maximum delay allowed for setter processes max_delay = 5 - storage = storage_for_dragon_fs_with_req_queue - backbone = BackboneFeatureStore(storage) extra_keys = [str(uuid.uuid4()) for _ in range(num_keys)] extra_values = [str(uuid.uuid4()) for _ in range(num_keys)] @@ -371,13 +425,14 @@ def test_backbone_wait_for_multikey( assert delay < max_delay, "write delay exceeds test timeout" logger.debug(f"Delaying {key} write by {delay} seconds") p = mp.Process( - target=set_value_after_delay, args=(backbone.descriptor, key, value, delay) + target=set_value_after_delay, + args=(the_backbone.descriptor, key, value, delay), ) p.start() processes.append(p) p2 = mp.Process( - target=backbone.wait_for, + target=the_backbone.wait_for, args=(extra_keys,), kwargs={"timeout": max_delay * 2}, ) @@ -390,7 +445,7 @@ def test_backbone_wait_for_multikey( # use without a wait to verify all values are written num_keys = len(extra_keys) - actual_values = backbone.wait_for(extra_keys, timeout=0.01) + actual_values = the_backbone.wait_for(extra_keys, timeout=0.01) assert len(extra_keys) == num_keys # confirm that wait_for returns all the expected values diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 59a30a3e8..1fa2bf5b4 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -452,12 +452,16 @@ def test_eventpublisher_serialize_failure( event = OnCreateConsumer(target_descriptor, filters=[]) # patch the __bytes__ implementation to cause pickling to fail during send - patch.setattr(event, "__bytes__", lambda x: b"abc") + def bad_bytes(self) -> bytes: + return b"abc" + + # this patch causes an attribute error when event pickling is attempted + patch.setattr(event, "__bytes__", bad_bytes) backbone.notification_channels = (target_descriptor,) # send a message into the channel - with pytest.raises(ValueError) as ex: + with pytest.raises(AttributeError) as ex: publisher.send(event) assert "serialize" in ex.value.args[0] @@ -729,12 +733,12 @@ def test_eventconsumer_batch_timeout( with pytest.raises(ValueError) as ex: # try to create a consumer w/a max recv size of 0 - EventConsumer( + consumer = EventConsumer( channel, backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN], - batch_timeout=invalid_timeout, ) + consumer.recv(batch_timeout=invalid_timeout) assert "positive" in ex.value.args[0] diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 091610592..fd93f9cfe 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -60,11 +60,18 @@ @pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: - return dragon_ddict.DDict() + """Fixture to instantiate a dragon distributed dictionary. + + NOTE: using module scoped fixtures drastically improves test run-time""" + return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. 
+
+    NOTE: using module scoped fixtures drastically improves test run-time"""
     wmgr_channel_ = create_local()
     wmgr_channel = DragonCommChannel(wmgr_channel_)
     return wmgr_channel
@@ -72,6 +79,10 @@ def the_worker_channel() -> DragonCommChannel:

 @pytest.fixture(scope="module")
 def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore:
+    """Fixture to create a distributed dragon dictionary and wrap it
+    in a BackboneFeatureStore.
+
+    NOTE: using module scoped fixtures drastically improves test run-time"""
     return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)


@@ -226,15 +237,36 @@
 @pytest.mark.parametrize(
     "buffer_size",
     [
-        pytest.param(-1, id="replace negative, default to 500"),
-        pytest.param(0, id="replace zero, default to 500"),
-        pytest.param(1, id="non-zero buffer size: 1"),
-        pytest.param(550, id="larger than default: 550"),
-        pytest.param(800, id="much larger than default: 800"),
+        pytest.param(
+            -1,
+            id="replace negative, default to 500",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        pytest.param(
+            0,
+            id="replace zero, default to 500",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        pytest.param(
+            1,
+            id="non-zero buffer size: 1",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        # pytest.param(500, id="maximum size edge case: 500"),
+        pytest.param(
+            550,
+            id="larger than default: 550",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        pytest.param(
+            800,
+            id="much larger than default: 800",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
         pytest.param(
             1000,
             id="very large buffer: 1000, unreliable in dragon-v0.10",
-            marks=pytest.mark.skip,
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
         ),
     ],
 )
From 1f4e6e302cf8a761ed90312df342b026832cb6a2 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Thu, 26 Sep 2024 13:31:55 -0500
Subject: [PATCH 14/40] more docstrings standard fixes

---
 .../storage/backbone_feature_store.py         | 69 +++++++++++--------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index 1542f3811..e5f54724c 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -24,7 +24,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import base64
 import enum
 import itertools
 import os
@@ -52,16 +51,6 @@
 logger = get_logger(__name__)


-def byte_descriptor_to_string(descriptor: bytes) -> str:
-    return base64.b64encode(descriptor).decode("utf-8")
-
-
-def string_descriptor_to_byte(descriptor: str) -> bytes:
-    return base64.b64decode(descriptor.encode("utf-8"))
-
-
-# todo: did i create an arms race where a developer just grabs the backbone
-# and passes it wherever they need a FeatureStore? 
class BackboneFeatureStore(DragonFeatureStore): """A DragonFeatureStore wrapper with utility methods for accessing shared information stored in the MLI backbone feature store.""" @@ -184,11 +173,12 @@ def from_writable_descriptor( cls, descriptor: str, ) -> "BackboneFeatureStore": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached DragonFeatureStore - :raises SmartSimError: if attachment to DragonFeatureStore fails""" + :raises SmartSimError: if attachment to DragonFeatureStore fails + """ try: return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True) except Exception as ex: @@ -199,11 +189,12 @@ def from_writable_descriptor( def _check_wait_timeout( self, start_time: float, timeout: float, indicators: t.Dict[str, bool] ) -> None: - """Perform timeout verification + """Perform timeout verification. :param start_time: the start time to use for elapsed calculation :param timeout: the timeout (in seconds) - :param indicators: latest retrieval status for requested keys""" + :param indicators: latest retrieval status for requested keys + """ elapsed = time.time() - start_time if timeout and elapsed > timeout: raise SmartSimError( @@ -214,10 +205,10 @@ def wait_for( self, keys: t.List[str], timeout: float = _DEFAULT_WAIT_TIMEOUT ) -> t.Dict[str, t.Union[str, bytes, None]]: """Perform a blocking wait until all specified keys have been found - in the backbone + in the backbone. :param keys: The required collection of keys to retrieve - :param timeout: The maximum wait time in seconds. Overrides class level setting + :param timeout: The maximum wait time in seconds """ if timeout < 0: timeout = self._DEFAULT_WAIT_TIMEOUT @@ -255,7 +246,10 @@ def wait_for( def get_env(self) -> t.Dict[str, str]: """Returns a dictionary populated with environment variables necessary to - connect a process to the existing backbone instance.""" + connect a process to the existing backbone instance. + + :returns: The dictionary populated with env vars + """ return {self.MLI_BACKBONE: self.descriptor} @@ -263,7 +257,9 @@ class EventCategory(str, enum.Enum): """Predefined event types raised by SmartSim backend.""" CONSUMER_CREATED: str = "consumer-created" + """Event category for an event raised when a new consumer is created""" FEATURE_STORE_WRITTEN: str = "feature-store-written" + """Event category for an event raised when a feature store key is written""" @dataclass @@ -350,10 +346,11 @@ class EventProducer(t.Protocol): """Core API of a class that publishes events.""" def send(self, event: EventBase, timeout: float = 0.001) -> int: - """The send operation. + """Send an event using the configured comm channel. :param event: The event to send :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of messages that were sent """ @@ -366,15 +363,24 @@ def __init__( backbone: BackboneFeatureStore, channel: t.Optional[CommChannelBase], ) -> None: - """Initialize the instance""" + """Initialize the instance. + + :param backbone: The backbone feature store to use + :param channel: The comm channel to send events on + """ self._backbone = backbone self._channel: t.Optional[CommChannelBase] = channel def send(self, event: EventBase, timeout: float = 0.001) -> int: - """The send operation""" + """Send an event using the configured comm channel. 
:param event: The event to send
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :returns: The number of message copies that were sent
+        :raises SmartSimError: If the comm channel is not configured
+        """
         if self._channel is None:
-            # self._channel = self._channel_factory(event)
-            raise Exception("No channel to send on")
+            raise SmartSimError("No channel to send on")

         num_sent = 0
         logger.debug(f"Sending {event} to {self._channel.descriptor}")
@@ -431,8 +437,8 @@ def _save_to_buffer(self, event: EventBase) -> None:
         """Places the event in the buffer to be sent once a consumer list
         is available.

-        :param event: The event to serialize and buffer
-        :raises ValueError: If the event cannot be serialized
+        :param event: The event to buffer
+        :raises ValueError: If the event cannot be buffered
         """
         try:
             self._event_buffer.append(event)
@@ -590,7 +596,7 @@ def __init__(

     @property
     def descriptor(self) -> str:
-        """The descriptor of the underlying comm channel where events are received
+        """The descriptor of the underlying comm channel.

         :returns: The comm channel descriptor"""
         return self._comm_channel.descriptor
@@ -670,7 +676,7 @@ def recv(
         return events_received

     def register(self) -> None:
-        """Send an event to register this consumer as a listener"""
+        """Send an event to register this consumer as a listener."""
         descriptor = self._comm_channel.descriptor

         event = OnCreateConsumer(descriptor, self._global_filters)
@@ -690,7 +696,14 @@ def register(self) -> None:
         logger.warning("Unable to register. No registrar channel found.")

     def listen_once(self, timeout: float = 0.001) -> None:
-        """Function to handle incoming events"""
+        """Receives messages for the consumer a single time.
+
+        NOTE: Executes a single batch-retrieval to receive the maximum
+        number of messages available under batch timeout. To continually
+        listen, use `listen` in a non-blocking thread/process
+
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        """
         logger.debug(f"Starting event listener with {timeout} second timeout")
         logger.debug("Awaiting new messages")
From 9c8d127fef516240c638d70055316e83650556dd Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Thu, 26 Sep 2024 16:55:10 -0500
Subject: [PATCH 15/40] reduce default worker connect timeout, fix test
 timeout issue due to measurement error

---
 ex/high_throughput_inference/mock_app.py      |  2 +-
 .../storage/backbone_feature_store.py         |  7 ++---
 smartsim/protoclient.py                       | 17 +++++++++---
 tests/dragon/test_protoclient.py              | 26 ++++++++++---------
 4 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py
index 2886bd5f9..f4db1bc1e 100644
--- a/ex/high_throughput_inference/mock_app.py
+++ b/ex/high_throughput_inference/mock_app.py
@@ -105,7 +105,7 @@ def name(self) -> str:

     resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt")

-    client = ProtoClient(timing_on=True, wait_timeout=0)
+    client = ProtoClient(timing_on=True)
     client.set_model(resnet.name, resnet.model)

     if CHECK_RESULTS_AND_MAKE_ALL_SLOWER:
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index e5f54724c..9fcf490e4 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -695,19 +695,20 @@ def register(self) -> None:
         else:
             logger.warning("Unable to register. 
No registrar channel found.")

-    def listen_once(self, timeout: float = 0.001) -> None:
+    def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
         """Receives messages for the consumer a single time.

         NOTE: Executes a single batch-retrieval to receive the maximum
         number of messages available under batch timeout. To continually
         listen, use `listen` in a non-blocking thread/process

-        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
         """
         logger.debug(f"Starting event listener with {timeout} second timeout")
         logger.debug("Awaiting new messages")

-        incoming_messages = self.recv(timeout=timeout)
+        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)

         if not incoming_messages:
             logger.debug("Consumer received empty message list.")
diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py
index c248300ca..a84a8a261 100644
--- a/smartsim/protoclient.py
+++ b/smartsim/protoclient.py
@@ -74,7 +74,7 @@ class ProtoClient:
     """Proof of concept implementation of a client enabling user applications
     to interact with MLI resources."""

-    _DEFAULT_BACKBONE_TIMEOUT = 30.0
+    _DEFAULT_BACKBONE_TIMEOUT = 1.0
     """A default timeout period applied to connection attempts with the
     backbone feature store."""

@@ -140,7 +140,11 @@ def _create_broadcaster(self) -> EventProducer:
         )
         return broadcaster

-    def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
+    def __init__(
+        self,
+        timing_on: bool,
+        backbone_timeout: float = _DEFAULT_BACKBONE_TIMEOUT,
+    ) -> None:
         """Initialize the client instance.

         :param timing_on: Flag indicating if timing information should be
@@ -157,7 +161,12 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
         else:
             rank = 0

-        self._backbone_timeout = wait_timeout
+        if backbone_timeout <= 0:
+            raise ValueError(
+                f"Invalid backbone timeout provided: {backbone_timeout}. "
+                "The value must be greater than zero."
+            )
+        self._backbone_timeout = max(backbone_timeout, 0.1)

         connect_to_infrastructure()

@@ -184,7 +193,7 @@ def backbone_timeout(self) -> float:
         from the backbone feature store. 
:returns: A float indicating the number of seconds to allow""" - return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT + return self._backbone_timeout def _add_label_to_timings(self, label: str) -> None: """Adds a new label into the timing dictionary to prepare for diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index c758ce971..6fb44ed3d 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -108,7 +108,7 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: @pytest.mark.parametrize( - "wait_timeout, exp_wait_max", + "backbone_timeout, exp_wait_max", [ # aggregate the 1+1+1 into 3 on remaining parameters pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), @@ -117,7 +117,7 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: ], ) def test_protoclient_timeout( - wait_timeout: float, + backbone_timeout: float, exp_wait_max: float, the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch, @@ -134,21 +134,23 @@ def test_protoclient_timeout( # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8] # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) - start_time = time.time() + with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: + start_time = time.time() ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) - ProtoClient(False, wait_timeout=wait_timeout) - - end_time = time.time() - elapsed = end_time - start_time + ProtoClient(timing_on=False, backbone_timeout=backbone_timeout) + elapsed = time.time() - start_time + logger.info(f"ProtoClient timeout occurred in {elapsed} seconds") - # todo: revisit. should this trigger any wait if the backbone is set above? - # confirm that we met our timeout - # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" + # todo: should this trigger any wait if the backbone is set above? 
+ # confirm that we met our timeout + assert ( + elapsed >= backbone_timeout + ), f"below configured timeout {backbone_timeout}" - # confirm that the total wait time is aligned with the sleep cycle - assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" + # confirm that the total wait time is aligned with the sleep cycle + assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" def test_protoclient_initialization_no_backbone( From 7442eb113607e5f1d2537fd47750759206271d9e Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 26 Sep 2024 17:08:34 -0500 Subject: [PATCH 16/40] use constants in tests for env var strings, docstrings, remove commented code --- tests/dragon/test_error_handling.py | 27 +++++++++++++++++++-------- tests/dragon/test_featurestore.py | 18 ++++++++++-------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 1e659168b..5c04faf0e 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -49,6 +49,9 @@ exception_handler, ) from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -115,9 +118,11 @@ def setup_worker_manager_model_bytes( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -171,9 +176,11 @@ def setup_worker_manager_model_key( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -225,9 +232,11 @@ def setup_request_dispatcher_model_bytes( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -266,9 +275,11 @@ def setup_request_dispatcher_model_key( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var 
for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 32e1c3a82..3e99762c9 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -73,7 +73,8 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: """Fixture to instantiate a dragon distributed dictionary. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @@ -82,10 +83,8 @@ def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel that can be attached to. - NOTE: using module scoped fixtures drastically improves test run-time""" - # wmgr_channel_ = create_local() - # wmgr_channel = DragonCommChannel(wmgr_channel_) - # return wmgr_channel + NOTE: using module scoped fixtures drastically improves test run-time + """ channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) @@ -99,7 +98,8 @@ def the_backbone( """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor @@ -117,7 +117,8 @@ def test_eventconsumer_eventpublisher_integration( :param storage_for_dragon_fs: the dragon storage engine to use :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ # verify ability to write and read from ddict the_backbone["test_dir"] = test_dir @@ -276,7 +277,8 @@ def set_value_after_delay( :param descriptor: the backbone feature store descriptor to attach to :param key: the key to write to - :param value: a value to write to the key""" + :param value: a value to write to the key + """ time.sleep(delay) backbone = BackboneFeatureStore.from_descriptor(descriptor) From da20b5f4b927ab56d45639c161b59624f33f0831 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 10:37:02 -0500 Subject: [PATCH 17/40] docstring formatting in tests --- tests/dragon/test_dragon_backend.py | 2 +- tests/dragon/test_environment_loader.py | 10 +-- tests/dragon/test_error_handling.py | 3 +- tests/dragon/test_featurestore.py | 29 +++--- tests/dragon/test_featurestore_base.py | 88 +++++++++++-------- tests/dragon/test_featurestore_integration.py | 15 ++-- tests/dragon/test_protoclient.py | 23 +++-- tests/dragon/test_worker_manager.py | 16 +++- 8 files changed, 113 insertions(+), 73 deletions(-) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index dc2aceeaa..8a48e0026 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -55,7 +55,7 @@ def test_dragonbackend_listener_boostrapping(monkeypatch: pytest.MonkeyPatch): """Verify that the dragon backend registration channel correctly registers new consumers in the backbone and begins 
sending events - to the new consumers""" + to the new consumers.""" backend = DragonBackend(pid=9999) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 47e75109a..08a0c0135 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -53,7 +53,7 @@ ], ) def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): - """A descriptor can be stored, loaded, and reattached""" + """A descriptor can be stored, loaded, and reattached.""" chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( @@ -76,7 +76,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded - queue are the same""" + queue are the same.""" chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( @@ -93,7 +93,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): - """An incorrect serialized descriptor will fails to attach""" + """An incorrect serialized descriptor will fails to attach.""" monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") @@ -109,7 +109,7 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): """Verify the dragon feature store is loaded correctly by the - EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" + EnvironmentConfigLoader to demonstrate featurestore_factory correctness.""" feature_store = DragonFeatureStore(DDict()) monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) @@ -127,7 +127,7 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch): """EnvironmentConfigLoader getters return None when environment - variables are not set""" + variables are not set.""" with monkeypatch.context() as patch: patch.setenv("_SMARTSIM_INFRA_BACKBONE", "") patch.setenv("_SMARTSIM_REQUEST_QUEUE", "") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 5c04faf0e..6f1e74dca 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -86,7 +86,8 @@ def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel that can be attached to. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ channel_ = create_local() fli_ = FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 3e99762c9..ea62fbbeb 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -81,7 +81,7 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. + that can be attached to. Does not modify environment vars. 
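For illustration, the constant-over-raw-string pattern these fixtures adopt can be sketched in isolation. The string values below are assumptions mirroring the raw names used elsewhere in the suite; the real constants live on `BackboneFeatureStore`:

    import pytest

    class MLIKeys:
        # assumed values; the library defines these on BackboneFeatureStore
        MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
        MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE"

    def test_env_wiring(monkeypatch: pytest.MonkeyPatch) -> None:
        # referencing constants keeps tests in sync if the env var names change
        monkeypatch.setenv(MLIKeys.MLI_BACKBONE, "mock-backbone-descriptor")
        monkeypatch.setenv(MLIKeys.MLI_WORKER_QUEUE, "mock-queue-descriptor")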
NOTE: using module scoped fixtures drastically improves test run-time """ @@ -98,6 +98,8 @@ def the_backbone( """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. + :param storage_for_dragon_fs: the dragon storage engine to use + :param the_worker_channel: a pre-configured worker channel NOTE: using module scoped fixtures drastically improves test run-time """ @@ -113,9 +115,9 @@ def test_eventconsumer_eventpublisher_integration( """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. This test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities + test_eventconsumer_eventpublisher_integration but requires dragon entities. - :param storage_for_dragon_fs: the dragon storage engine to use + :param the_backbone: the dragon storage engine to use :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs """ @@ -197,9 +199,9 @@ def test_backbone_wait_for_no_keys( ) -> None: """Verify that asking the backbone to wait for a value succeeds immediately and does not cause a wait to occur if the supplied key - list is empty + list is empty. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait @@ -218,9 +220,9 @@ def test_backbone_wait_for_prepopulated( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for a value succeed - immediately and do not cause a wait to occur if the data exists + immediately and do not cause a wait to occur if the data exists. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait @@ -244,9 +246,9 @@ def test_backbone_wait_for_prepopulated_dupe( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for keys that are duplicated - results in a single value being returned for each key + results in a single value being returned for each key. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait @@ -278,6 +280,7 @@ def set_value_after_delay( :param descriptor: the backbone feature store descriptor to attach to :param key: the key to write to :param value: a value to write to the key + :param delay: amount of delay to apply before writing the key """ time.sleep(delay) @@ -325,9 +328,9 @@ def test_backbone_wait_for_partial_prepopulated( the_backbone: BackboneFeatureStore, delay: float ) -> None: """Verify that when data is not all in the backbone, the `wait_for` operation - continues to poll until it finds everything it needs + continues to poll until it finds everything it needs. 
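The `wait_for` contract exercised by these cases amounts to polling with a cycled backoff until every requested key appears or the timeout elapses. A minimal sketch of that contract, not the library implementation:

    import time
    import typing as t

    def wait_for(
        storage: t.Mapping[str, str], keys: t.Iterable[str], timeout: float
    ) -> t.Dict[str, str]:
        """Poll until all keys exist; duplicated keys yield a single value each."""
        pending = set(keys)                      # dedupe requested keys
        found: t.Dict[str, str] = {}
        deadline = time.time() + timeout
        backoff = [0.1, 0.2, 0.4, 0.8]           # the cycle the tests budget for
        while pending:
            for key in [k for k in pending if k in storage]:
                found[key] = storage[key]
                pending.discard(key)
            if pending:
                if time.time() >= deadline:
                    raise TimeoutError(f"keys never appeared: {pending}")
                time.sleep(backoff[0])
                backoff = backoff[1:] + backoff[:1]  # rotate through the cycle
        return found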
- :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with :param delay: the number of seconds the second process will wait before setting the target value in the backbone featurestore """ @@ -409,9 +412,9 @@ def test_backbone_wait_for_multikey( test_dir: str, ) -> None: """Verify that asking the backbone to wait for multiple keys results - in that number of values being returned + in that number of values being returned. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with :param num_keys: the number of extra keys to set & request in the backbone """ # maximum delay allowed for setter processes diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 1fa2bf5b4..84594e3c2 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -67,12 +67,12 @@ def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: def boom(*args, **kwargs) -> None: """Helper function that blows up when used to mock up - some other function""" + some other function.""" raise Exception(f"you shall not pass! {args}, {kwargs}") def test_event_uid() -> None: - """Verify that all events include a unique identifier""" + """Verify that all events include a unique identifier.""" uids: t.Set[str] = set() num_iters = 1000 @@ -90,7 +90,7 @@ def test_event_uid() -> None: def test_mli_reserved_keys_conversion() -> None: """Verify that conversion from a string to an enum member - works as expected""" + works as expected.""" for reserved_key in ReservedKeys: # iterate through all keys and verify `from_string` works @@ -103,7 +103,7 @@ def test_mli_reserved_keys_conversion() -> None: def test_mli_reserved_keys_writes() -> None: """Verify that attempts to write to reserved keys are blocked from a - standard DragonFeatureStore but enabled with the BackboneFeatureStore""" + standard DragonFeatureStore but enabled with the BackboneFeatureStore.""" mock_storage = {} dfs = DragonFeatureStore(mock_storage) @@ -132,10 +132,11 @@ def test_mli_reserved_keys_writes() -> None: def test_mli_consumers_read_by_key() -> None: - """Verify that the value returned from the mli consumers - method is written to the correct key and reads are - allowed via standard dragon feature store. - NOTE: should reserved reads also be blocked""" + """Verify that the value returned from the mli consumers method is written + to the correct key and reads are allowed via standard dragon feature store. 
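A toy model of the reserved-key guard under test, with placeholder key strings and a stand-in exception type in place of the library's own:

    class ToyStore:
        """Toy stand-in for the reserved-key guard; keys here are placeholders."""

        _RESERVED = {"_RESERVED_QUEUE_KEY", "_RESERVED_CONSUMERS_KEY"}

        def __init__(self, allow_reserved_writes: bool = False) -> None:
            self._data: dict = {}
            self._allow = allow_reserved_writes

        def __setitem__(self, key: str, value: str) -> None:
            if key in self._RESERVED and not self._allow:
                # the real stores raise a library-specific error here
                raise PermissionError(f"{key} is reserved")
            self._data[key] = value

    backbone = ToyStore(allow_reserved_writes=True)  # backbone-style store
    backbone["_RESERVED_QUEUE_KEY"] = "descriptor"   # permitted
    standard = ToyStore()
    # standard["_RESERVED_QUEUE_KEY"] = "x" would raise PermissionError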
+ + NOTE: should reserved reads also be blocked + """ mock_storage = {} dfs = DragonFeatureStore(mock_storage) @@ -154,7 +155,7 @@ def test_mli_consumers_read_by_key() -> None: def test_mli_consumers_read_by_backbone() -> None: """Verify that the backbone reads the correct location - when using the backbone feature store API instead of mapping API""" + when using the backbone feature store API instead of mapping API.""" mock_storage = {} backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -168,7 +169,7 @@ def test_mli_consumers_read_by_backbone() -> None: def test_mli_consumers_write_by_backbone() -> None: """Verify that the backbone writes the correct location - when using the backbone feature store API instead of mapping API""" + when using the backbone feature store API instead of mapping API.""" mock_storage = {} backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -182,10 +183,11 @@ def test_mli_consumers_write_by_backbone() -> None: def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: """Verify that a broadcast operation without any registered subscribers - succeeds without raising Exceptions + succeeds without raising Exceptions. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} consumer_descriptor = storage_path / "test-consumer" @@ -217,10 +219,11 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: """Verify that a broadcast operation without any registered subscribers - succeeds without raising Exceptions + succeeds without raising Exceptions. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -249,10 +252,11 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: """Verify that a broadcast operation reports an error if no channel - factory was supplied for constructing the consumer channels + factory was supplied for constructing the consumer channels. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -277,10 +281,11 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: """Verify that a successful broadcast clears messages from the event - buffer when a new message is sent and consumers are registered + buffer when a new message is sent and consumers are registered. 
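The broadcaster behavior pinned down by these cases (buffer while no consumers are registered, report the total sent, clear the buffer on a successful send) can be sketched as follows; the class and its interface are illustrative, not the library API:

    import typing as t

    class ToyBroadcaster:
        """Illustrative model of the broadcast contract, not the library API."""

        def __init__(self, channel_factory: t.Callable[[str], t.Any]) -> None:
            self._factory = channel_factory
            self._buffer: t.List[bytes] = []

        def send(self, event: bytes, consumers: t.List[str]) -> int:
            self._buffer.append(event)    # held until consumers are registered
            if not consumers:
                return 0
            num_sent = 0
            for descriptor in consumers:
                channel = self._factory(descriptor)  # factory builds channels
                for message in self._buffer:
                    channel.send(message)
                    num_sent += 1
            self._buffer.clear()          # a successful broadcast empties it
            return num_sent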
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -363,10 +368,11 @@ def test_eventpublisher_broadcast_returns_total_sent( def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: - """Verify that any unused consumers are pruned each time a new event is sent + """Verify that any unused consumers are pruned each time a new event is sent. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -429,12 +435,13 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: def test_eventpublisher_serialize_failure( test_dir: str, monkeypatch: pytest.MonkeyPatch ) -> None: - """Verify that errors during message serialization are raised to the caller + """Verify that errors during message serialization are raised to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -470,12 +477,13 @@ def bad_bytes(self) -> bytes: def test_eventpublisher_factory_failure( test_dir: str, monkeypatch: pytest.MonkeyPatch ) -> None: - """Verify that errors during channel construction are raised to the caller + """Verify that errors during channel construction are raised to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -504,12 +512,13 @@ def boom(descriptor: str) -> None: def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None: """Verify that unexpected errors during message send are caught and wrapped in a - SmartSimError so they are not propagated directly to the caller + SmartSimError so they are not propagated directly to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -544,10 +553,11 @@ def boom(self) -> None: def test_eventconsumer_receive(test_dir: str) -> None: - """Verify that a consumer retrieves a message from the given channel + """Verify that a consumer retrieves a message from the given channel. 
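The consumer side reduces to a drain loop; a sketch assuming a channel object whose `recv` returns `None` once no messages remain:

    import typing as t

    def drain(channel: t.Any, batch_timeout: float = 0.1) -> t.List[bytes]:
        """Collect available messages, skipping (not failing on) empty payloads."""
        received: t.List[bytes] = []
        while (message := channel.recv(timeout=batch_timeout)) is not None:
            if not message:              # an empty message is ignored
                continue
            received.append(message)
        return received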
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -575,12 +585,13 @@ def test_eventconsumer_receive(test_dir: str) -> None: @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16]) def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: - """Verify that a consumer retrieves multiple message from the given channel + """Verify that a consumer retrieves multiple message from the given channel. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param num_sent: parameterized value used to vary the number of events - that are enqueued and validations are checked at multiple queue sizes""" + that are enqueued and validations are checked at multiple queue sizes + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -605,10 +616,11 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: def test_eventconsumer_receive_empty(test_dir: str) -> None: """Verify that a consumer receiving an empty message ignores the - message and continues processing + message and continues processing. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -636,7 +648,8 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: multiple publishers and consumers are sending simultaneously. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -722,7 +735,8 @@ def test_eventconsumer_batch_timeout( :param invalid_timeout: any invalid timeout that should fail validation :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -758,8 +772,12 @@ def test_eventconsumer_batch_timeout( def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None: """Verify that attempts to attach to the worker queue from the protoclient timeout in an appropriate amount of time. Note: due to the backoff, we verify - the elapsed time is less than the 15s of a cycle of waits + the elapsed time is less than the 15s of a cycle of waits. + :param wait_timeout: Maximum amount of time (in seconds) to allow the backbone + to wait for the requested value to exist + :param exp_wait_max: Maximum amount of time (in seconds) to set as the upper + bound to allow the delays with backoff to occur :param storage_for_dragon_fs: the dragon storage engine to use """ diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index fd93f9cfe..fb86ad7cd 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -62,7 +62,8 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: """Fixture to instantiate a dragon distributed dictionary. 
- NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @@ -71,7 +72,8 @@ def the_worker_channel() -> DragonCommChannel: """Fixture to create a valid descriptor for a worker channel that can be attached to. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) return wmgr_channel @@ -82,7 +84,8 @@ def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) @@ -96,7 +99,8 @@ def test_eventconsumer_eventpublisher_integration( :param storage_for_dragon_fs: the dragon storage engine to use :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -278,7 +282,8 @@ def test_channel_buffer_size( until a configured maximum value is exceeded. :param buffer_size: the maximum number of messages allowed in a channel buffer - :param storage_for_dragon_fs: the dragon storage engine to use""" + :param storage_for_dragon_fs: the dragon storage engine to use + """ mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 6fb44ed3d..86becf71e 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -73,7 +73,7 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: """Fixture that creates a dragon backbone feature store. - :param storage_for_dragon_fs: + :param storage_for_dragon_fs: the distributed dictionary to use in backbone :returns: The backbone feature store :returns: The attached `BackboneFeatureStore` """ @@ -124,13 +124,14 @@ def test_protoclient_timeout( ): """Verify that attempts to attach to the worker queue from the protoclient timeout in an appropriate amount of time. Note: due to the backoff, we verify - the elapsed time is less than the 15s of a cycle of waits + the elapsed time is less than the 15s of a cycle of waits. 

-    :param wait_timeout: a timeout for use when configuring a proto client
+    :param backbone_timeout: a timeout for use when configuring a proto client
     :param exp_wait_max: a ceiling for the expected time spent waiting for
         the timeout
     :param the_backbone: a pre-initialized backbone featurestore for setting up
-        the environment variable required by the client"""
+        the environment variable required by the client
+    """
 
     # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8]
     # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
@@ -179,7 +180,7 @@ def test_protoclient_initialization(
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Verify that attempting to start the client with required env vars results
-    in a fully initialized client
+    in a fully initialized client.
 
     :param the_backbone: a pre-initialized backbone featurestore
     :param the_worker_queue: an FLI channel the client will retrieve
@@ -227,11 +228,13 @@ def test_protoclient_write_model(
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Verify that writing a model using the client causes the model data to be
-    written to a feature store
+    written to a feature store.
 
     :param the_backbone: a pre-initialized backbone featurestore
-    :param the_worker_queue: an FLI channel the client will retrieve
-        from the backbone"""
+    :param the_worker_queue: the worker queue fixture, passed to ensure
+        the worker queue environment (the queue descriptor registered
+        in the backbone) is correctly configured
+    """
 
     with monkeypatch.context() as ctx:
         # we won't actually send here
@@ -262,13 +265,15 @@ def test_protoclient_write_model_notification_sent(
     num_listeners: int,
     num_model_updates: int,
 ):
-    """Verify that writing a model sends a key-written event
+    """Verify that writing a model sends a key-written event.
 
     :param the_backbone: a pre-initialized backbone featurestore
     :param the_worker_queue: an FLI channel the client will retrieve
         from the backbone
     :param num_listeners: vary the number of registered listeners
         to verify that the event is broadcast to everyone
+    :param num_model_updates: vary the number of model updates to verify
+        the broadcast counts messages sent correctly
     """
 
     # we won't actually send here, but it won't try without registered listeners
diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py
index 69d962940..132bb2110 100644
--- a/tests/dragon/test_worker_manager.py
+++ b/tests/dragon/test_worker_manager.py
@@ -136,7 +136,14 @@ def mock_messages(
     comm_channel_root_dir: pathlib.Path,
     kill_queue: mp.Queue,
 ) -> None:
-    """Mock event producer for triggering the inference pipeline"""
+    """Mock event producer for triggering the inference pipeline.
+
+    :param feature_store_root_dir: Path to a directory where a
+        FileSystemFeatureStore can read & write results
+    :param comm_channel_root_dir: Path to a directory where a
+        FileSystemCommChannel can read & write messages
+    :param kill_queue: Queue used by unit test to stop mock_message process
+    """
     feature_store_root_dir.mkdir(parents=True, exist_ok=True)
     comm_channel_root_dir.mkdir(parents=True, exist_ok=True)
 
@@ -203,7 +210,7 @@ def mock_messages(
 
 def mock_mli_infrastructure_mgr() -> None:
     """Create resources normally instanatiated by the infrastructure
-    management portion of the DragonBackend
+    management portion of the DragonBackend. 
""" config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -228,8 +235,9 @@ def mock_mli_infrastructure_mgr() -> None: def prepare_environment(test_dir: str) -> pathlib.Path: """Cleanup prior outputs to run demo repeatedly. - :param tes_dir: the directory to prepare - :returns: The path to the log file""" + :param test_dir: the directory to prepare + :returns: The path to the log file + """ path = pathlib.Path(f"{test_dir}/workermanager.log") logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) return path From f5ba5a69a5ed671dc50fca916d27113b5cd6d722 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 12:00:39 -0500 Subject: [PATCH 18/40] docstrings --- .../_core/launcher/dragon/dragonBackend.py | 2 -- smartsim/protoclient.py | 3 +-- tests/dragon/test_error_handling.py | 5 +---- tests/dragon/test_featurestore.py | 11 ++--------- tests/dragon/test_featurestore_base.py | 5 +---- tests/dragon/test_featurestore_integration.py | 18 ++++++------------ 6 files changed, 11 insertions(+), 33 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 6dc61516e..1d8c71e7d 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -81,8 +81,6 @@ logger = get_logger(__name__) -# TODO: create ticket for follow-up task to replace defunct -# dragon_group_state.Running() & .Error() class DragonStatus(str, Enum): ERROR = "Error" RUNNING = "Running" diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index a84a8a261..7f6d6f412 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -91,7 +91,6 @@ def _attach_to_backbone() -> BackboneFeatureStore: :returns: The attached backbone featurestore """ - # todo: ensure this env var from config loader or constant descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) if descriptor is None or not descriptor: raise SmartSimError( @@ -154,7 +153,7 @@ def __init__( :raises: SmartSimError if unable to attach to a backbone featurestore """ if MPI is not None: - # todo: determine a way to make MPI work in the test environment + # TODO: determine a way to make MPI work in the test environment # - consider catching the import exception and defaulting rank to 0 comm = MPI.COMM_WORLD rank: int = comm.Get_rank() diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 6f1e74dca..7d2c4cb3c 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -84,10 +84,7 @@ @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + that can be attached to.""" channel_ = create_local() fli_ = FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index ea62fbbeb..e815e0dd9 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -71,20 +71,14 @@ @pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary. 
- - NOTE: using module scoped fixtures drastically improves test run-time - """ + """Fixture to instantiate a dragon distributed dictionary.""" return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. Does not modify environment vars. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + that can be attached to. Does not modify environment vars.""" channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) @@ -100,7 +94,6 @@ def the_backbone( :param storage_for_dragon_fs: the dragon storage engine to use :param the_worker_channel: a pre-configured worker channel - NOTE: using module scoped fixtures drastically improves test run-time """ backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 84594e3c2..2e032213b 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -133,10 +133,7 @@ def test_mli_reserved_keys_writes() -> None: def test_mli_consumers_read_by_key() -> None: """Verify that the value returned from the mli consumers method is written - to the correct key and reads are allowed via standard dragon feature store. - - NOTE: should reserved reads also be blocked - """ + to the correct key and reads are allowed via standard dragon feature store.""" mock_storage = {} dfs = DragonFeatureStore(mock_storage) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index fb86ad7cd..470193597 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -60,20 +60,14 @@ @pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + """Fixture to instantiate a dragon distributed dictionary.""" return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + that can be attached to.""" wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) return wmgr_channel @@ -84,7 +78,7 @@ def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - NOTE: using module scoped fixtures drastically improves test run-time + :param storage_for_dragon_fs: The dragon storage engine to use """ return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) @@ -95,10 +89,10 @@ def test_eventconsumer_eventpublisher_integration( """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. This test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities + test_eventconsumer_eventpublisher_integration but requires dragon entities. 
- :param storage_for_dragon_fs: the dragon storage engine to use - :param test_dir: pytest fixture automatically generating unique working + :param storage_for_dragon_fs: The dragon storage engine to use + :param test_dir: Automatically generated unique working directories for individual test outputs """ From d68975423d245e7ed7430f7e2c873bad67e689e4 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 15:20:30 -0500 Subject: [PATCH 19/40] parameterize ddict creation, add single ddict touchpoint util module, use fixtures for pytest ddict creation --- .../_core/launcher/dragon/dragonBackend.py | 18 ++- .../storage/dragon_feature_store.py | 9 +- .../mli/infrastructure/storage/dragon_util.py | 100 ++++++++++++++ tests/dragon/test_dragon_backend.py | 5 - tests/dragon/test_dragon_ddict_utils.py | 123 ++++++++++++++++++ tests/dragon/test_environment_loader.py | 15 ++- tests/dragon/test_error_handling.py | 16 ++- tests/dragon/test_featurestore.py | 15 ++- tests/dragon/test_featurestore_base.py | 6 - tests/dragon/test_featurestore_integration.py | 31 +++-- tests/dragon/test_protoclient.py | 12 +- tests/dragon/test_reply_building.py | 1 - tests/dragon/test_request_dispatcher.py | 12 +- tests/dragon/test_worker_manager.py | 14 +- 14 files changed, 310 insertions(+), 67 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/storage/dragon_util.py create mode 100644 tests/dragon/test_dragon_ddict_utils.py diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 1d8c71e7d..0c172365a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -37,7 +37,7 @@ # pylint: disable=import-error,C0302,R0915 # isort: off -import dragon.data.ddict.ddict as dragon_ddict + import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc @@ -56,6 +56,7 @@ EventConsumer, OnCreateConsumer, ) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict # pylint: enable=import-error # isort: on @@ -157,6 +158,10 @@ class DragonBackend: by threads spawned by it. 
""" + _DEFAULT_NUM_MGR_PER_NODE = 2 + _DEFAULT_MEM_PER_NODE = 256 * 1024**2 + """The default memory capacity to allocate for a feaure store node (in megabytes)""" + def __init__(self, pid: int) -> None: self._pid = pid """PID of dragon executable which launched this server""" @@ -553,11 +558,12 @@ def _create_backbone(self) -> BackboneFeatureStore: :returns: The descriptor of the backbone feature store """ if self._backbone is None: - logger.info("Creating backbone storage DDict") - backbone_storage = dragon_ddict.DDict( - n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 - ) # todo: parametrize - logger.info("Created backbone storage DDict") + backbone_storage = create_ddict( + len(self._hosts), + self._DEFAULT_NUM_MGR_PER_NODE, + self._DEFAULT_MEM_PER_NODE, + ) + self._backbone = BackboneFeatureStore( backbone_storage, allow_reserved_writes=True ) diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index 4eeeac32f..ecc232f21 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -32,6 +32,10 @@ # isort: on +from smartsim._core.mli.infrastructure.storage.dragon_util import ( + ddict_to_descriptor, + descriptor_to_ddict, +) from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.error import SmartSimError from smartsim.log import get_logger @@ -48,7 +52,7 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: :param storage: A distributed dictionary to be used as the underlying storage mechanism of the feature store""" if isinstance(storage, dragon_ddict.DDict): - descriptor = str(storage.serialize()) + descriptor = ddict_to_descriptor(storage) else: descriptor = "not-set" @@ -99,7 +103,8 @@ def from_descriptor( """ try: logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}") - return cls(dragon_ddict.DDict.attach(descriptor)) + storage = descriptor_to_ddict(descriptor) + return cls(storage) except Exception as ex: raise SmartSimError( f"Error creating dragon feature store from descriptor: {descriptor}" diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py new file mode 100644 index 000000000..fda89bba5 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py @@ -0,0 +1,100 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# pylint: disable=import-error
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+# isort: on
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+def ddict_to_descriptor(ddict: dragon_ddict.DDict) -> str:
+    """Convert a DDict to a descriptor string.
+
+    :param ddict: The dragon dictionary to convert
+    :returns: The descriptor string
+    """
+    if ddict is None:
+        raise ValueError("DDict is not available to create a descriptor")
+
+    # unlike other dragon objects, the dictionary serializes to a string
+    # instead of bytes
+    return str(ddict.serialize())
+
+
+def descriptor_to_ddict(descriptor: str) -> dragon_ddict.DDict:
+    """Attach to an existing DDict instance given
+    its string-encoded descriptor.
+
+    :param descriptor: The descriptor of a dictionary to attach to
+    :returns: The attached dragon dictionary"""
+    return dragon_ddict.DDict.attach(descriptor)
+
+
+def create_ddict(
+    num_nodes: int, mgr_per_node: int, mem_per_node: int
+) -> dragon_ddict.DDict:
+    """Create a distributed dragon dictionary.
+
+    :param num_nodes: The number of distributed nodes to distribute the dictionary to.
+    At least one node is required.
+    :param mgr_per_node: The number of manager processes per node
+    :param mem_per_node: The amount of memory (in bytes) to allocate per node. Total
+    memory available will be calculated as `num_nodes * mem_per_node`
+
+    :returns: The instantiated dragon dictionary
+    :raises ValueError: If invalid num_nodes is supplied
+    :raises ValueError: If invalid mem_per_node is supplied
+    :raises ValueError: If invalid mgr_per_node is supplied
+    """
+    if num_nodes < 1:
+        raise ValueError("A dragon dictionary must have at least 1 node")
+
+    if mgr_per_node < 1:
+        raise ValueError("A dragon dict requires at least 1 manager per node")
+
+    if mem_per_node < dragon_ddict.DDICT_MIN_SIZE:
+        raise ValueError(
+            "A dragon dictionary requires at least "
+            f"{dragon_ddict.DDICT_MIN_SIZE / 1024**2} MB"
+        )
+
+    mem_total = num_nodes * mem_per_node
+
+    logger.debug(
+        f"Creating dragon dictionary with {num_nodes} nodes, {mem_total} bytes of memory"
+    )
+
+    distributed_dict = dragon_ddict.DDict(num_nodes, mgr_per_node, total_mem=mem_total)
+    logger.debug(
+        "Successfully created dragon dictionary with "
+        f"{num_nodes} nodes, {mem_total} bytes total memory"
+    )
+    return distributed_dict
diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index 8a48e0026..b56a92c5b 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -25,16 +25,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
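Taken together, the helpers in the new `dragon_util` module support a simple round trip; a usage sketch assuming a dragon runtime is available:

    from smartsim._core.mli.infrastructure.storage.dragon_util import (
        create_ddict,
        ddict_to_descriptor,
        descriptor_to_ddict,
    )

    # one node, two managers, 32 MB expressed in bytes
    storage = create_ddict(1, 2, 32 * 1024**2)
    descriptor = ddict_to_descriptor(storage)  # string-safe handle for env vars

    # a second process can attach with nothing but the descriptor string
    attached = descriptor_to_ddict(descriptor)
    assert ddict_to_descriptor(attached) == descriptor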
import os -import typing as t -import unittest.mock as mock import pytest dragon = pytest.importorskip("dragon") -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict -from dragon.fli import DragonFLIError, FLInterface from smartsim._core.launcher.dragon.dragonBackend import DragonBackend from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py new file mode 100644 index 000000000..0df33e7a7 --- /dev/null +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -0,0 +1,123 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +dragon = pytest.importorskip("dragon") + +# isort: off +import dragon.data.ddict.ddict as dragon_ddict + +# isort: on + +from smartsim._core.mli.infrastructure.storage import dragon_util +from smartsim.log import get_logger + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +logger = get_logger(__name__) + + +@pytest.fixture(scope="module") +def the_storage() -> dragon_ddict.DDict: + """Verify that a descriptor is created.""" + return dragon_util.create_ddict(1, 2, 3 * 1024**2) + + +@pytest.mark.parametrize( + "num_nodes, num_managers, mem_per_node", + [ + pytest.param(1, 1, 3 * 1024**2, id="3MB, Bare minimum allocation"), + pytest.param(2, 2, 128 * 1024**2, id="128 MB allocation, 2 nodes, 2 mgr"), + pytest.param(2, 1, 512 * 1024**2, id="512 MB allocation, 2 nodes, 1 mgr"), + ], +) +def test_dragon_storage_util_create_ddict( + num_nodes: int, + num_managers: int, + mem_per_node: int, +): + """Verify that a dragon dictionary is successfully created. 
+
+    :param num_nodes: Number of ddict nodes to attempt to create
+    :param num_managers: Number of managers per node to request
+    :param mem_per_node: Memory to allocate per node
+    """
+    ddict = dragon_util.create_ddict(num_nodes, num_managers, mem_per_node)
+
+    assert ddict is not None
+
+
+@pytest.mark.parametrize(
+    "num_nodes, num_managers, mem_per_node",
+    [
+        pytest.param(-1, 1, 3 * 1024**2, id="Negative Node Count"),
+        pytest.param(0, 1, 3 * 1024**2, id="Invalid Node Count"),
+        pytest.param(1, -1, 3 * 1024**2, id="Negative Mgr Count"),
+        pytest.param(1, 0, 3 * 1024**2, id="Invalid Mgr Count"),
+        pytest.param(1, 1, -3 * 1024**2, id="Negative Mem Per Node"),
+        pytest.param(1, 1, (3 * 1024**2) - 1, id="Invalid Mem Per Node"),
+        pytest.param(1, 1, 0 * 1024**2, id="No Mem Per Node"),
+    ],
+)
+def test_dragon_storage_util_create_ddict_validators(
+    num_nodes: int,
+    num_managers: int,
+    mem_per_node: int,
+):
+    """Verify that invalid arguments to `create_ddict` are rejected.
+
+    :param num_nodes: Number of ddict nodes to attempt to create
+    :param num_managers: Number of managers per node to request
+    :param mem_per_node: Memory to allocate per node
+    """
+    with pytest.raises(ValueError):
+        dragon_util.create_ddict(num_nodes, num_managers, mem_per_node)
+
+
+def test_dragon_storage_util_get_ddict_descriptor(the_storage: dragon_ddict.DDict):
+    """Verify that a descriptor is created.
+
+    :param the_storage: A pre-allocated ddict
+    """
+    value = dragon_util.ddict_to_descriptor(the_storage)
+
+    assert isinstance(value, str)
+    assert len(value) > 0
+
+
+def test_dragon_storage_util_get_ddict_from_descriptor(the_storage: dragon_ddict.DDict):
+    """Verify that a ddict can be attached from a descriptor.
+
+    :param the_storage: A pre-allocated ddict
+    """
+    descriptor = dragon_util.ddict_to_descriptor(the_storage)
+
+    value = dragon_util.descriptor_to_ddict(descriptor)
+
+    assert value is not None
+    assert isinstance(value, dragon_ddict.DDict)
+    assert dragon_util.ddict_to_descriptor(value) == descriptor
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 08a0c0135..9dd0255fe 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -28,8 +28,8 @@
 
 dragon = pytest.importorskip("dragon")
 
+import dragon.data.ddict.ddict as dragon_ddict
 import dragon.utils as du
-from dragon.data.ddict.ddict import DDict
 from dragon.fli import FLInterface
 
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
@@ -39,12 +39,19 @@
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim.error.errors import SmartSimError
 
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
 
+@pytest.fixture(scope="module")
+def the_storage() -> dragon_ddict.DDict:
+    """Fixture to instantiate a dragon distributed dictionary."""
+    return create_ddict(1, 2, 4 * 1024**2)
+
+
 @pytest.mark.parametrize(
     "content",
     [
@@ -107,10 +114,12 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
         config.get_queue()
 
 
-def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_backbone_load_dfs(
+    monkeypatch: pytest.MonkeyPatch, the_storage: dragon_ddict.DDict
+):
     """Verify the dragon feature store is loaded correctly by the
     EnvironmentConfigLoader to demonstrate featurestore_factory correctness."""
-    feature_store = 
DragonFeatureStore(DDict()) + feature_store = DragonFeatureStore(the_storage) monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) config = EnvironmentConfigLoader( diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 7d2c4cb3c..4f511a9c3 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -29,6 +29,8 @@ import pytest +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict + dragon = pytest.importorskip("dragon") import multiprocessing as mp @@ -81,6 +83,12 @@ pytestmark = pytest.mark.dragon +@pytest.fixture(scope="module") +def the_storage() -> DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return create_ddict(1, 2, 4 * 1024**2) + + @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel @@ -92,17 +100,17 @@ def the_worker_channel() -> DragonFLIChannel: @pytest.fixture(scope="module") -def backbone_descriptor() -> str: +def backbone_descriptor(the_storage) -> str: # create a shared backbone featurestore - feature_store = DragonFeatureStore(DDict()) + feature_store = DragonFeatureStore(the_storage) return feature_store.descriptor @pytest.fixture(scope="module") -def app_feature_store() -> FeatureStore: +def app_feature_store(the_storage) -> FeatureStore: # create a standalone feature store to mimic a user application putting # data into an application-owned resource (app should not access backbone) - app_fs = DragonFeatureStore(DDict()) + app_fs = DragonFeatureStore(the_storage) return app_fs diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index e815e0dd9..35720fa9d 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -34,6 +34,8 @@ import pytest +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict + dragon = pytest.importorskip("dragon") from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel @@ -50,7 +52,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( time as bbtime, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict from smartsim.log import get_logger logger = get_logger(__name__) @@ -70,9 +71,9 @@ @pytest.fixture(scope="module") -def storage_for_dragon_fs() -> t.Dict[str, str]: +def the_storage() -> t.Dict[str, str]: """Fixture to instantiate a dragon distributed dictionary.""" - return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) + return create_ddict(1, 2, 4 * 1024**2) @pytest.fixture(scope="module") @@ -87,16 +88,16 @@ def the_worker_channel() -> DragonFLIChannel: @pytest.fixture(scope="module") def the_backbone( - storage_for_dragon_fs: t.Any, the_worker_channel: DragonFLIChannel + the_storage: t.Any, the_worker_channel: DragonFLIChannel ) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. 
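The bootstrap this fixture performs (registration shown just below) is the same handshake a client later relies on; in outline, assuming the `wait_for` helper accepts a key list and a timeout as the tests above suggest:

    # producer side: advertise the worker queue descriptor in the backbone
    backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor

    # consumer side: block until the key is published, then attach to the queue
    values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], 5.0)
    queue_descriptor = values[BackboneFeatureStore.MLI_WORKER_QUEUE]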
- :param storage_for_dragon_fs: the dragon storage engine to use - :param the_worker_channel: a pre-configured worker channel + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel """ - backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor return backbone diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 2e032213b..2278a0036 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -59,12 +59,6 @@ RANDOMLY_SET_KEY = "_SOMETHING_ELSE" -@pytest.fixture -def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: - storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"} - return storage - - def boom(*args, **kwargs) -> None: """Helper function that blows up when used to mock up some other function.""" diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 470193597..e4d6bb9eb 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -43,7 +43,10 @@ OnCreateConsumer, OnWriteFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +from smartsim._core.mli.infrastructure.storage.dragon_util import ( + create_ddict, + dragon_ddict, +) # isort: off from dragon.channels import Channel @@ -59,9 +62,9 @@ @pytest.fixture(scope="module") -def storage_for_dragon_fs() -> t.Dict[str, str]: +def the_storage() -> dragon_ddict.DDict: """Fixture to instantiate a dragon distributed dictionary.""" - return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) + return create_ddict(1, 2, 32 * 1024**2) @pytest.fixture(scope="module") @@ -74,29 +77,29 @@ def the_worker_channel() -> DragonCommChannel: @pytest.fixture(scope="module") -def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: +def the_backbone(the_storage: t.Any) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - :param storage_for_dragon_fs: The dragon storage engine to use + :param the_storage: The dragon storage engine to use """ - return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + return BackboneFeatureStore(the_storage, allow_reserved_writes=True) def test_eventconsumer_eventpublisher_integration( - storage_for_dragon_fs: t.Any, test_dir: str + the_storage: t.Any, test_dir: str ) -> None: """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. This test closely tracks the test in tests/test_featurestore.py also named test_eventconsumer_eventpublisher_integration but requires dragon entities. 
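The shape of the integration scenario (a write followed by one notification per registered listener) can be modeled without any dragon machinery; all names here are illustrative:

    import typing as t

    def notify_on_write(
        store: t.MutableMapping[str, bytes],
        key: str,
        value: bytes,
        listeners: t.List[t.Callable[[str], None]],
    ) -> int:
        """Write a value, then deliver one notification per registered listener."""
        store[key] = value
        for callback in listeners:
            callback(key)
        return len(listeners)

    received: t.List[str] = []
    sent = notify_on_write({}, "my-model", b"model-bytes", [received.append] * 3)
    assert sent == 3 and received == ["my-model"] * 3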
- :param storage_for_dragon_fs: The dragon storage engine to use + :param the_storage: The dragon storage engine to use :param test_dir: Automatically generated unique working directories for individual test outputs """ - mock_storage = storage_for_dragon_fs + mock_storage = the_storage backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) # verify ability to write and read from ddict @@ -190,7 +193,7 @@ def test_eventconsumer_max_dequeue( :param num_events: Total number of events to raise in the test :param batch_timeout: Maximum wait time (in seconds) for a message to be sent :param max_batches_expected: Maximum number of receives that should occur - :param storage_for_dragon_fs: Dragon storage engine to use + :param the_storage: Dragon storage engine to use """ # create some consumers to receive messages @@ -270,16 +273,16 @@ def test_eventconsumer_max_dequeue( ) def test_channel_buffer_size( buffer_size: int, - storage_for_dragon_fs: t.Any, + the_storage: t.Any, ) -> None: """Verify that a channel used by an EventBroadcaster can buffer messages until a configured maximum value is exceeded. - :param buffer_size: the maximum number of messages allowed in a channel buffer - :param storage_for_dragon_fs: the dragon storage engine to use + :param buffer_size: Maximum number of messages allowed in a channel buffer + :param the_storage: The dragon storage engine to use """ - mock_storage = storage_for_dragon_fs + mock_storage = the_storage backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 86becf71e..2e6d1dcc5 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -42,13 +42,13 @@ EventBroadcaster, OnWriteFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError from smartsim.log import get_logger # isort: off from dragon import fli -from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict # from ..ex..high_throughput_inference.mock_app import ProtoClient from smartsim.protoclient import ProtoClient @@ -61,16 +61,16 @@ @pytest.fixture(scope="module") -def storage_for_dragon_fs() -> t.Dict[str, str]: +def the_storage() -> DDict: """Fixture that creates a dragon distributed dictionary. :returns: The attached distributed dictionary """ - return dragon_ddict.DDict(1, 2, 4 * 1024**2) + return create_ddict(1, 2, 32 * 1024**2) @pytest.fixture(scope="module") -def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: +def the_backbone(the_storage) -> BackboneFeatureStore: """Fixture that creates a dragon backbone feature store. 
:param storage_for_dragon_fs: the distributed dictionary to use in backbone @@ -78,7 +78,7 @@ def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: :returns: The attached `BackboneFeatureStore` """ - return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + return BackboneFeatureStore(the_storage, allow_reserved_writes=True) @pytest.fixture(scope="module") diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index 063200dd6..48493b3c4 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -31,7 +31,6 @@ dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.worker_manager import build_failure_reply -from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index b6be86177..82f41e3db 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -68,6 +68,7 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.log import get_logger @@ -85,9 +86,15 @@ pass +@pytest.fixture(scope="module") +def the_storage() -> DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return create_ddict(1, 2, 4 * 1024**2) + + @pytest.mark.parametrize("num_iterations", [4]) def test_request_dispatcher( - msg_pump_factory: _MsgPumpFactory, num_iterations: int + msg_pump_factory: _MsgPumpFactory, num_iterations: int, the_storage: DDict ) -> None: """Test the request dispatcher batching and queueing system @@ -99,8 +106,7 @@ def test_request_dispatcher( to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) - ddict = DDict(1, 2, 4 * 1024**2) - backbone_fs = BackboneFeatureStore(ddict, allow_reserved_writes=True) + backbone_fs = BackboneFeatureStore(the_storage, allow_reserved_writes=True) # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 132bb2110..a2df57f3b 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -31,6 +31,8 @@ import pytest +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict + torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -39,7 +41,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor try: mp.set_start_method("dragon") @@ -48,10 +49,8 @@ import os -import dragon.channels as dch import torch.nn as nn from dragon import fli -from dragon.data.ddict.ddict import DDict from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local @@ -256,14 +255,9 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: mgr_per_node = 1 num_nodes = 2 - mem_per_node = 1024**3 - total_mem = num_nodes * mem_per_node 
+ mem_per_node = 128 * 1024**2 - storage = DDict( - managers_per_node=mgr_per_node, - n_nodes=num_nodes, - total_mem=total_mem, - ) + storage = create_ddict(num_nodes, mgr_per_node, mem_per_node) backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) to_worker_channel = create_local() From 7ddcd7c7bca5e5bc02454c846a53cb6aaed2f5a4 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 15:44:40 -0500 Subject: [PATCH 20/40] remove completed todos, fix docstrings, remove obsolete/commented code --- .../storage/dragon_feature_store.py | 2 - tests/dragon/test_dragon_ddict_utils.py | 2 +- tests/dragon/test_protoclient.py | 1 - tests/mli/test_default_torch_worker.py | 206 ------------------ 4 files changed, 1 insertion(+), 210 deletions(-) delete mode 100644 tests/mli/test_default_torch_worker.py diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index ecc232f21..7c640bab6 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -56,8 +56,6 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: else: descriptor = "not-set" - # todo: follow up and ensure this descriptor is also encoded/decoded - # in a string-safe way here & in `from_descriptor` super().__init__(descriptor) self._storage: t.Dict[str, t.Union[str, bytes]] = storage diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py index 0df33e7a7..d2240abc1 100644 --- a/tests/dragon/test_dragon_ddict_utils.py +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -43,7 +43,7 @@ @pytest.fixture(scope="module") def the_storage() -> dragon_ddict.DDict: - """Verify that a descriptor is created.""" + """Fixture to instantiate a dragon distributed dictionary.""" return dragon_util.create_ddict(1, 2, 3 * 1024**2) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 2e6d1dcc5..b871de267 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -144,7 +144,6 @@ def test_protoclient_timeout( elapsed = time.time() - start_time logger.info(f"ProtoClient timeout occurred in {elapsed} seconds") - # todo: should this trigger any wait if the backbone is set above? # confirm that we met our timeout assert ( elapsed >= backbone_timeout diff --git a/tests/mli/test_default_torch_worker.py b/tests/mli/test_default_torch_worker.py deleted file mode 100644 index b2ec6c3dc..000000000 --- a/tests/mli/test_default_torch_worker.py +++ /dev/null @@ -1,206 +0,0 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# import io -# import pathlib -# import typing as t - -# import pytest -# import torch - -# from smartsim._core.mli.infrastructure.worker.integratedtorchworker import ( -# IntegratedTorchWorker, -# ) -# import smartsim.error as sse -# from smartsim._core.mli.infrastructure import MemoryFeatureStore -# from smartsim._core.mli.infrastructure.worker.worker import ( -# ExecuteResult, -# FetchInputResult, -# FetchModelResult, -# InferenceRequest, -# TransformInputResult, -# LoadModelResult, -# ) -# from smartsim._core.utils import installed_redisai_backends - -# # The tests in this file belong to the group_a group -# pytestmark = pytest.mark.group_b - -# # retrieved from pytest fixtures -# is_dragon = pytest.test_launcher == "dragon" -# torch_available = "torch" in installed_redisai_backends() - - -# @pytest.fixture -# def persist_torch_model(test_dir: str) -> pathlib.Path: -# test_path = pathlib.Path(test_dir) -# model_path = test_path / "basic.pt" - -# model = torch.nn.Linear(2, 1) -# torch.save(model, model_path) - -# return model_path - - -# # def test_deserialize() -> None: -# # """Verify that serialized requests are properly deserialized to -# # and converted to the internal representation used by ML workers""" -# # worker = SampleTorchWorker -# # buffer = io.BytesIO() - -# # exp_model_key = "model-key" -# # msg = InferenceRequest(model_key=exp_model_key) -# # pickle.dump(msg, buffer) - -# # deserialized: InferenceRequest = worker.deserialize(buffer.getvalue()) - -# # assert deserialized.model_key == exp_model_key -# # # assert deserialized.backend == exp_backend - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_load_model_from_disk(persist_torch_model: pathlib.Path) -> None: -# """Verify that a model can be loaded using a FileSystemFeatureStore""" -# worker = IntegratedTorchWorker -# request = InferenceRequest(raw_model=persist_torch_model.read_bytes()) - -# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) -# load_result = worker.load_model(request, fetch_result) - -# input = torch.randn(2) -# pred = load_result.model(input) - -# assert pred - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_transform_input() -> None: -# """Verify that the default input transform operation is a no-op copy""" -# rows, cols = 1, 4 -# num_values = 7 -# tensors = [torch.randn((rows, cols)) for _ in range(num_values)] - -# request = InferenceRequest() - -# inputs: t.List[bytes] = [] -# for tensor in tensors: -# buffer = io.BytesIO() -# torch.save(tensor, buffer) -# inputs.append(buffer.getvalue()) - -# fetch_result = FetchInputResult(inputs) -# worker = IntegratedTorchWorker -# result = worker.transform_input(request, fetch_result) -# transformed: t.Collection[torch.Tensor] = result.transformed - -# assert len(transformed) == num_values - -# for output, expected in zip(transformed, tensors): -# assert output.shape == expected.shape -# assert 
output.equal(expected) - -# transformed = list(transformed) - -# original: torch.Tensor = tensors[0] -# assert transformed[0].equal(original) - -# # verify a copy was made -# transformed[0] = 2 * transformed[0] -# assert transformed[0].equal(2 * original) - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_execute_model(persist_torch_model: pathlib.Path) -> None: -# """Verify that a model executes corrrectly via the worker""" - -# # put model bytes into memory -# model_name = "test-key" -# feature_store = MemoryFeatureStore() -# feature_store[model_name] = persist_torch_model.read_bytes() - -# worker = IntegratedTorchWorker -# request = InferenceRequest(model_key=model_name) -# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) -# load_result = worker.load_model(request, fetch_result) - -# value = torch.randn(2) -# transform_result = TransformInputResult([value]) - -# execute_result = worker.execute(request, load_result, transform_result) - -# assert execute_result.predictions is not None - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_execute_missing_model(persist_torch_model: pathlib.Path) -> None: -# """Verify that a executing a model with an invalid key fails cleanly""" - -# # use key that references an un-set model value -# model_name = "test-key" -# feature_store = MemoryFeatureStore() -# feature_store[model_name] = persist_torch_model.read_bytes() - -# worker = IntegratedTorchWorker -# request = InferenceRequest(input_keys=[model_name]) - -# load_result = LoadModelResult(None) -# transform_result = TransformInputResult( -# [torch.randn(2), torch.randn(2), torch.randn(2)] -# ) - -# with pytest.raises(sse.SmartSimError) as ex: -# worker.execute(request, load_result, transform_result) - -# assert "Model must be loaded" in ex.value.args[0] - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_transform_output() -> None: -# """Verify that the default output transform operation is a no-op copy""" -# rows, cols = 1, 4 -# num_values = 7 -# inputs = [torch.randn((rows, cols)) for _ in range(num_values)] -# exp_outputs = [torch.Tensor(tensor) for tensor in inputs] - -# worker = SampleTorchWorker -# request = InferenceRequest() -# exec_result = ExecuteResult(inputs) - -# result = worker.transform_output(request, exec_result) - -# assert len(result.outputs) == num_values - -# for output, expected in zip(result.outputs, exp_outputs): -# assert output.shape == expected.shape -# assert output.equal(expected) - -# transformed = list(result.outputs) - -# # verify a copy was made -# original: torch.Tensor = inputs[0] -# transformed[0] = 2 * transformed[0] - -# assert transformed[0].equal(2 * original) From 762937c8017961a66ce33a95eb0aaaf60dd3501d Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 30 Sep 2024 22:46:14 -0500 Subject: [PATCH 21/40] extract notify listener from dragon backend, fix dragon import order, fix --- smartsim/_core/entrypoints/service.py | 48 ++- .../_core/launcher/dragon/dragonBackend.py | 109 +++--- .../infrastructure/control/event_listener.py | 318 ++++++++++++++++++ .../storage/backbone_feature_store.py | 44 ++- .../storage/dragon_feature_store.py | 13 +- .../infrastructure/storage/feature_store.py | 4 +- tests/dragon/test_dragon_backend.py | 294 +++++++++++++--- tests/dragon/test_error_handling.py | 3 +- tests/dragon/test_featurestore.py | 3 +- tests/dragon/test_worker_manager.py | 10 +- 10 files changed, 
704 insertions(+), 142 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/event_listener.py diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b6..27d541312 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -40,14 +40,22 @@ class Service(ABC): hooks for status changes""" def __init__( - self, as_service: bool = False, cooldown: int = 0, loop_delay: int = 0 + self, + as_service: bool = False, + cooldown: int = 0, + loop_delay: int = 0, + health_check_frequency: float = 0, ) -> None: """Initialize the ServiceHost + :param as_service: Determines if the host will run until shutdown criteria are met or as a run-once instance :param cooldown: Period of time to allow service to run before automatic shutdown, in seconds. A non-zero, positive integer. - :param loop_delay: delay between iterations of the event loop""" + :param loop_delay: Delay between iterations of the event loop (in seconds) + :param health_check_frequency: Delay between calls to a + health check handler (in seconds) + """ self._as_service = as_service """If the service should run until shutdown function returns True""" self._cooldown = abs(cooldown) @@ -55,6 +63,11 @@ def __init__( before shutdown""" self._loop_delay = abs(loop_delay) """Forced delay between iterations of the event loop""" + self._health_check_frequency = health_check_frequency + """The time (in seconds) between desired health checks. A health check + frequency of zero will never trigger the health check.""" + self._last_health_check = time.time() + """The timestamp of the latest health check""" @abstractmethod def _on_iteration(self) -> None: @@ -76,6 +89,11 @@ def _on_shutdown(self) -> None: the main event loop during automatic shutdown.""" logger.debug(f"Shutting down {self.__class__.__name__}") + def _on_health_check(self) -> None: + """Empty hook method for use by subclasses. Invoked based on the + value of `self._health_check_frequency`.""" + logger.debug(f"Performing health check for {self.__class__.__name__}") + def _on_cooldown_elapsed(self) -> None: """Empty hook method for use by subclasses. Called on every event loop iteration immediately upon exceeding the cooldown period""" @@ -98,13 +116,30 @@ def execute(self) -> None: """The main event loop of a service host. Evaluates shutdown criteria and combines with a cooldown period to allow automatic service termination. 
Responsible for executing calls to subclass implementation of `_on_iteration`""" - self._on_start() + + try: + self._on_start() + except Exception: + logger.exception("Unable to start service.") + return running = True cooldown_start: t.Optional[datetime.datetime] = None while running: - self._on_iteration() + try: + self._on_iteration() + except Exception: + running = False + logger.exception( + "Failure in event loop resulted in service termination" + ) + + if self._health_check_frequency > 0: + hc_elapsed = time.time() - self._last_health_check + if hc_elapsed >= self._health_check_frequency: + self._on_health_check() + self._last_health_check = time.time() # allow immediate shutdown if not set to run as a service if not self._as_service: @@ -133,4 +168,7 @@ def execute(self) -> None: self._on_delay() time.sleep(self._loop_delay) - self._on_shutdown() + try: + self._on_shutdown() + except Exception: + logger.exception("Service shutdown may not have completed.") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 0c172365a..fa28f8690 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -27,6 +27,7 @@ import functools import itertools import os +import socket import time import typing as t from dataclasses import dataclass, field @@ -47,16 +48,15 @@ import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.control.event_listener import ( + ConsumerRegistrationListener, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBase, EventCategory, - EventConsumer, - OnCreateConsumer, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict +from smartsim.error.errors import SmartSimError # pylint: enable=import-error # isort: on @@ -199,11 +199,9 @@ def __init__(self, pid: int) -> None: """Time in seconds needed by the server to complete shutdown""" self._backbone: t.Optional[BackboneFeatureStore] = None """The backbone feature store""" - self._event_consumer: t.Optional[EventConsumer] = None - """A consumer registered to listen for new consumers and update the shared - consumer registrations list""" + self._listener: t.Optional[dragon_process.Process] = None + """The standalone process executing the event consumer""" - """An event consumer for receiving events from MLI resources""" self._nodes: t.List["dragon_machine.Node"] = [] """Node capability information for hosts in the allocation""" self._hosts: t.List[str] = [] @@ -573,20 +571,6 @@ def _create_backbone(self) -> BackboneFeatureStore: return self._backbone - def _on_consumer_created(self, event: EventBase) -> None: - """Event handler for updating the backbone when new event consumers - are registered. 
-
-        :param event: The event that was received
-        """
-        if isinstance(event, OnCreateConsumer) and self._backbone is not None:
-            notify_list = set(self._backbone.notification_channels)
-            notify_list.add(event.descriptor)
-            self._backbone.notification_channels = list(notify_list)
-            return
-
-        logger.warning(f"Unhandled event received: {event}")
-
     @staticmethod
     def _initialize_cooldown() -> int:
         """Load environment configuration and determine the correct cooldown
@@ -601,47 +585,38 @@
             else 5
         )
 
-    def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer:
-        """
-        Create an event publisher and event consumer for communicating with
-        other MLI resources.
-
-        :param backbone: The backbone feature store used by the MLI backend.
-
-        NOTE: the backbone must be initialized before connecting to eventing clients.
-
-        :returns: The newly created EventConsumer instance
-        """
-
-        if self._event_consumer is None:
-            logger.info("Creating event consumer")
-            dragon_channel = create_local(500)
-            event_channel = DragonCommChannel(dragon_channel)
-            consumer = EventConsumer(
-                event_channel,
-                backbone,
-                [EventCategory.CONSUMER_CREATED],
-                name="BackendConsumerRegistrar",
-                event_handler=self._on_consumer_created,
-            )
-
-            self._event_consumer = consumer
-            backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = consumer.descriptor
-            logger.info(f"Backend consumer `{consumer.name}` created.")
-
-        return self._event_consumer
-
-    def listen_to_registrations(self, timeout: float = 0.001) -> None:
-        """Execute the listener for registration events.
+    def start_event_listener(
+        self, cpu_affinity: list[int], gpu_affinity: list[int]
+    ) -> dragon_process.Process:
+        """Start a standalone process hosting a `ConsumerRegistrationListener`
+        that maintains the list of registered event consumers.
+
+        :param cpu_affinity: The CPU affinity to use for the listener process
+        :param gpu_affinity: The GPU affinity to use for the listener process
+        :returns: The dragon process running the listener
+        :raises SmartSimError: If the backbone is not yet created
+        """
+        if self._backbone is None:
+            raise SmartSimError("Backbone feature store is not available")
 
-    :param timeout: Maximum time to wait (in seconds) for a new event"""
-        if self._event_consumer is not None:
-            self._event_consumer.listen_once(timeout)
+        service = ConsumerRegistrationListener(
+            self._backbone, 1.0, 2.0, [EventCategory.CONSUMER_CREATED], True
+        )
 
-    @staticmethod
-    def _start_eventing_listeners() -> None:
-        # todo: start external listener entrypoint
-        ...
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + process = dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + env={ + **os.environ, + **(self._backbone.get_env() if self._backbone is not None else {}), + }, + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + process.start() + return process @staticmethod def create_run_policy( @@ -684,8 +659,6 @@ def create_run_policy( def _start_steps(self) -> None: self._heartbeat() - backbone = self._create_backbone() - self._create_eventing(backbone) with self._queue_lock: started = [] @@ -713,7 +686,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - **backbone.get_env(), + **(self._backbone.get_env() if self._backbone else {}), }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, @@ -869,8 +842,7 @@ def _should_print_status(self) -> bool: def _update(self) -> None: """Trigger all update queries and update local state database""" - backbone = self._create_backbone() - self._create_eventing(backbone) + self._create_backbone() self._stop_steps() self._start_steps() @@ -879,6 +851,9 @@ def _update(self) -> None: def _kill_all_running_jobs(self) -> None: with self._queue_lock: + if self._listener and self._listener.is_alive: + self._listener.kill() + for step_id, group_info in self._group_infos.items(): if group_info.status not in TERMINAL_STATUSES: self._stop_requests.append(DragonStopRequest(step_id=step_id)) diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/event_listener.py new file mode 100644 index 000000000..03d7b1ceb --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/event_listener.py @@ -0,0 +1,318 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
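+
+# A minimal usage sketch for `DragonBackend.start_event_listener` from the
+# preceding diff, assuming the backend has already created its backbone;
+# empty affinity lists accept the default local placement:
+#
+#   proc = backend.start_event_listener(cpu_affinity=[], gpu_affinity=[])
+#   # the listener publishes its channel descriptor once start-up completes
+#   backend._backbone.wait_for([BackboneFeatureStore.MLI_BACKEND_CONSUMER],
+#                              timeout=30)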
+
+# isort: off
+# pylint: disable=import-error
+# pylint: disable=unused-import
+import dragon

+# from dragon.globalservices.api_setup import connect_to_infrastructure


+# pylint: enable=unused-import
+# pylint: enable=import-error
+# isort: on
+
+import argparse
+import multiprocessing as mp
+import os
+import sys
+import typing as t
+
+from smartsim._core.entrypoints.service import Service
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBase,
+    EventCategory,
+    EventConsumer,
+    OnCreateConsumer,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class ConsumerRegistrationListener(Service):
+    """A long-running service that listens for events of a specific type
+    and executes the appropriate event handler."""
+
+    def __init__(
+        self,
+        backbone: BackboneFeatureStore,
+        timeout: float,
+        batch_timeout: float,
+        event_filters: t.List[EventCategory],
+        as_service: bool = False,
+        cooldown: int = 0,
+        health_check_frequency: float = 60.0,
+    ) -> None:
+        """Initialize the ConsumerRegistrationListener.
+
+        :param backbone: The backbone feature store
+        :param timeout: Maximum time (in seconds) to allow a single recv request to wait
+        :param batch_timeout: Maximum time (in seconds) to allow a batch of receives to
+        continue to build
+        :param event_filters: Filters specifying the message types to handle
+        :param as_service: Specifies run-once or run-until-complete behavior of service
+        :param cooldown: Number of seconds to wait before shutting down after
+        shutdown criteria are met
+        :param health_check_frequency: Time (in seconds) between health check invocations
+        """
+        super().__init__(
+            as_service, cooldown, health_check_frequency=health_check_frequency
+        )
+
+        self._timeout = timeout
+        """Maximum time (in seconds) to allow a single recv request to wait"""
+
+        self._batch_timeout = batch_timeout
+        """Maximum time (in seconds) to allow a batch of receives to
+        continue to build"""
+
+        self._filters = event_filters
+        """Filters specifying the message types to handle"""
+
+        self._consumer: t.Optional[EventConsumer] = None
+        """The event consumer that handles receiving events"""
+
+        self._backbone = backbone
+        """A standalone, system-created feature store used to share internal
+        information among MLI components"""
+
+    def _on_start(self) -> None:
+        """Called on initial entry into Service `execute` event loop before
+        `_on_iteration` is invoked."""
+        super()._on_start()
+        self._create_eventing()
+
+    def _on_shutdown(self) -> None:
+        """Release dragon resources. Called immediately after exiting
+        the main event loop during automatic shutdown."""
+        super()._on_shutdown()
+
+        # unregister this listener in the backbone
+        self._backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER)
+
+    def _on_iteration(self) -> None:
+        """Listen for a single batch of events and dispatch them
+        to the registered event handler."""
+
+        if self._consumer is None:
+            logger.info("Unable to listen. No consumer available.")
+            return
+
+        self._consumer.listen_once(self._timeout, self._batch_timeout)
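+
+    # A sketch of the lifecycle the inherited `Service.execute` loop drives,
+    # assuming constructor arguments like those documented above:
+    #
+    #   listener = ConsumerRegistrationListener(
+    #       backbone, timeout=1.0, batch_timeout=2.0,
+    #       event_filters=[EventCategory.CONSUMER_CREATED], as_service=True,
+    #   )
+    #   listener.execute()  # _on_start, repeated _on_iteration calls (plus
+    #                       # periodic _on_health_check), then _on_shutdown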
+
+    def _can_shutdown(self) -> bool:
+        """Determines if the event consumer is ready to stop listening.
+
+        :returns: True when criteria to shutdown the service are met, False otherwise
+        """
+
+        if self._backbone is None:
+            logger.info("Listener must shutdown: no backbone attached")
+            return True
+
+        if self._consumer is None:
+            logger.info("Listener must shutdown: no consumer channel created")
+            return True
+
+        if not self._consumer.listening:
+            logger.info("Listener can shutdown: consumer is not listening")
+            return True
+
+        return False
+
+    def _on_event_received(self, event: EventBase) -> None:
+        """Event handler for updating the backbone when new event consumers
+        are registered.
+
+        :param event: The event that was received
+        """
+        if self._backbone is None:
+            logger.info("Unable to handle event. Backbone is missing.")
+
+        if not isinstance(event, OnCreateConsumer):
+            logger.info(
+                "Consumer registration listener received an "
+                f"unexpected event: {event=}"
+            )
+            return
+
+        notify_list = set(self._backbone.notification_channels)
+        notify_list.add(event.descriptor)
+        self._backbone.notification_channels = list(notify_list)
+
+    def _on_health_check(self) -> None:
+        """Check if this consumer has been replaced by a new listener
+        and automatically trigger a shutdown. Invoked based on the
+        value of `self._health_check_frequency`."""
+        super()._on_health_check()
+
+        try:
+            logger.debug("Retrieving registered listener descriptor")
+            descriptor = self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+        except KeyError:
+            descriptor = None
+            if self._consumer:
+                self._consumer.listening = False
+
+        if self._consumer and descriptor != self._consumer.descriptor:
+            logger.warning(
+                "This listener is no longer registered. It "
+                "will automatically shut down."
+            )
+            self._consumer.listening = False
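+
+    # A sketch of the replacement detection the health check performs,
+    # assuming a running listener and a writable backbone:
+    #
+    #   backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = new_descriptor
+    #   # on its next health check the original listener sees a descriptor
+    #   # other than its own, stops listening, and `_can_shutdown` ends it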
+
+    def _publish_consumer(self) -> None:
+        """Publish the consumer descriptor to the backbone."""
+        if self._consumer is None:
+            logger.warning("No consumer descriptor available to publisher")
+            return
+
+        self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = (
+            self._consumer.descriptor
+        )
+
+    def _create_eventing(self) -> EventConsumer:
+        """
+        Create an event publisher and event consumer for communicating with
+        other MLI resources.
+
+        NOTE: `self._backbone` must be initialized before connecting eventing clients.
+
+        :returns: The newly created EventConsumer instance
+        """
+
+        if self._consumer:
+            return self._consumer
+
+        logger.info("Creating event consumer")
+
+        dragon_channel = create_local(500)
+        event_channel = DragonCommChannel(dragon_channel)
+
+        if not event_channel.descriptor:
+            raise SmartSimError(
+                "Unable to generate the descriptor for the event channel"
+            )
+
+        self._consumer = EventConsumer(
+            event_channel,
+            self._backbone,
+            self._filters,
+            name="BackendConsumerRegistrar",
+            event_handler=self._on_event_received,
+        )
+        self._publish_consumer()
+
+        logger.info(
+            f"Backend consumer `{self._consumer.name}` created: "
+            f"{self._consumer.descriptor}"
+        )
+
+        return self._consumer
+
+
+def _create_parser() -> argparse.ArgumentParser:
+    """
+    Create an argument parser that contains the arguments
+    required to start the listener as a new process:
+
+    --timeout
+    --batch_timeout
+    --categories
+
+    :returns: A configured parser
+    """
+    arg_parser = argparse.ArgumentParser(prog="ConsumerRegistrarEventListener")
+
+    category_default = EventCategory.CONSUMER_CREATED
+
+    arg_parser.add_argument("--timeout", type=float, default=1.0)
+    arg_parser.add_argument("--batch_timeout", type=float, default=1.0)
+    arg_parser.add_argument("--categories", type=str, default=category_default)
+
+    return arg_parser
+
+
+def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
+    """
+    Load the backbone by retrieving the descriptor from environment variables.
+
+    :returns: The backbone feature store, or `None` when no descriptor
+        is set in the environment
+    """
+    descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "")
+
+    if not descriptor:
+        return None
+
+    logger.info(f"Listener backbone descriptor: {descriptor}")
+
+    # `from_writable_descriptor` ensures we can update the backbone
+    return BackboneFeatureStore.from_writable_descriptor(descriptor)
+
+
+if __name__ == "__main__":
+    mp.set_start_method("dragon")
+
+    parser = _create_parser()
+
+    args = parser.parse_args()
+    user_filters: t.List[EventCategory] = list(args.categories.split(","))
+
+    backbone_fs = _connect_backbone()
+
+    if backbone_fs is None:
+        logger.error(
+            "Unable to attach to the backbone without the "
+            f"`{BackboneFeatureStore.MLI_BACKBONE}` environment variable."
+        )
+        sys.exit(1)
+
+    logger.debug(f"Listener attached to backbone: {backbone_fs.descriptor}")
+
+    listener = ConsumerRegistrationListener(
+        backbone_fs,
+        float(args.timeout),
+        float(args.batch_timeout),
+        user_filters,
+        as_service=True,
+    )
+
+    logger.info(f"Listener created: {listener}")
+
+    try:
+        listener.execute()
+        sys.exit(0)
+    except Exception:
+        logger.exception("An error occurred in the event listener")
+        sys.exit(1)
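+
+# An invocation sketch for this entrypoint; it assumes the environment
+# variable named by `BackboneFeatureStore.MLI_BACKBONE` was exported by the
+# backend, and the launcher command shown is illustrative:
+#
+#   dragon python -m smartsim._core.mli.infrastructure.control.event_listener \
+#       --timeout 1.0 --batch_timeout 1.0 --categories consumer-created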
{listener}") + + try: + listener.execute() + sys.exit(0) + except Exception: + logger.exception("An error occurred in the event listener") + sys.exit(1) diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 9fcf490e4..ffeb917a9 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -76,8 +76,7 @@ def __init__( super().__init__(storage) self._enable_reserved_writes = allow_reserved_writes - if self._CREATED_ON not in self: - self._record_creation_data() + self._record_creation_data() @property def wait_timeout(self) -> float: @@ -114,7 +113,9 @@ def notification_channels(self, values: t.Sequence[str]) -> None: :param values: The list of channel descriptors to save """ - self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values]) + self[self.MLI_NOTIFY_CONSUMERS] = ",".join( + [str(value) for value in values if value] + ) @property def backend_channel(self) -> t.Optional[str]: @@ -198,7 +199,8 @@ def _check_wait_timeout( elapsed = time.time() - start_time if timeout and elapsed > timeout: raise SmartSimError( - f"Backbone {self.descriptor=} timeout retrieving all keys: {indicators}" + f"Backbone {self.descriptor=} timeout after {elapsed} " + f"seconds retrieving keys: {indicators}" ) def wait_for( @@ -260,6 +262,8 @@ class EventCategory(str, enum.Enum): """Event category for an event raised when a new consumer is created""" FEATURE_STORE_WRITTEN: str = "feature-store-written" """Event category for an event raised when a feature store key is written""" + SHUTDOWN: str = "shutdown" + """Event category for an event that should trigger the listener to shutdown""" @dataclass @@ -288,6 +292,14 @@ def __str__(self) -> str: return f"{self.uid}|{self.category}" +class OnShutdownRequested(EventBase): + """Publish this event to trigger the listener to shutdown.""" + + def __init__(self) -> None: + """Initialize the OnShutdownRequest event.""" + super().__init__(EventCategory.SHUTDOWN, str(uuid.uuid4())) + + class OnCreateConsumer(EventBase): """Publish this event when a new event consumer registration is required.""" @@ -593,6 +605,7 @@ def __init__( self._global_filters = filters or [] self._name = name self._event_handler = event_handler + self.listening = True @property def descriptor(self) -> str: @@ -696,7 +709,10 @@ def register(self) -> None: logger.warning("Unable to register. No registrar channel found.") def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: - """Receives messages for the consumer a single time. + """Receives messages for the consumer a single time. Delivers + all messages that pass the consumer filters. Shutdown requests + are handled by a default event handler. + NOTE: Executes a single batch-retrieval to receive the maximum number of messages available under batch timeout. 
+
+
 class OnCreateConsumer(EventBase):
     """Publish this event when a new event consumer registration is required."""
 
@@ -593,6 +605,7 @@ def __init__(
         self._global_filters = filters or []
         self._name = name
         self._event_handler = event_handler
+        self.listening = True
+        """Flag indicating the consumer should continue listening for events"""
 
     @property
     def descriptor(self) -> str:
@@ -696,7 +709,10 @@ def register(self) -> None:
             logger.warning("Unable to register. No registrar channel found.")
 
     def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
-        """Receives messages for the consumer a single time.
+        """Receives messages for the consumer a single time. Delivers
+        all messages that pass the consumer filters. Shutdown requests
+        are handled by a default event handler.
+
         NOTE: Executes a single batch-retrieval to receive the maximum
         number of messages available under batch timeout. To continually
@@ -715,5 +731,23 @@ def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None
 
         for message in incoming_messages:
             logger.debug(f"Sending event {message=} to handler.")
+            self._handle_shutdown(message)
             if self._event_handler:
                 self._event_handler(message)
+
+    def _handle_shutdown(self, event: EventBase) -> None:
+        """Handles shutdown requests sent to the consumer by setting the
+        `self.listening` property to `False`."""
+        if isinstance(event, OnShutdownRequested):
+            self.listening = False
+
+    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer until a shutdown request is received.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        self.listening = True
+
+        while self.listening:
+            self.listen_once(timeout, batch_timeout)
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index 7c640bab6..c8c85623f 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -68,7 +68,7 @@ def _get(self, key: str) -> t.Union[str, bytes]:
         """
         try:
             return self._storage[key]
-        except KeyError as e:
+        except dragon_ddict.DDictKeyError as e:
             raise KeyError(f"Key not found in FeatureStore: {key}") from e
 
     def _set(self, key: str, value: t.Union[str, bytes]) -> None:
@@ -88,6 +88,17 @@ def _contains(self, key: str) -> bool:
         """
         return key in self._storage
 
+    def pop(self, key: str) -> t.Union[str, bytes, None]:
+        """Remove the value from the dictionary and return it.
+
+        :param key: Dictionary key identifying the item to remove
+        :returns: The value held at the key if it exists, otherwise `None`
+        """
+        try:
+            return self._storage.pop(key)
+        except dragon_ddict.DDictKeyError:
+            return None
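+
+    # A usage sketch for `pop`, assuming an attached DragonFeatureStore `fs`:
+    #
+    #   value = fs.pop("ephemeral-key")  # the stored value, or None if absent
+    #   fs["missing-key"]                # raises KeyError for unknown keys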
+
     @classmethod
     def from_descriptor(
         cls,
diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py
index 8c85a352d..260b1a337 100644
--- a/smartsim/_core/mli/infrastructure/storage/feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py
@@ -147,8 +147,8 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]:
         """
         try:
             return self._get(key)
-        except KeyError as ex:
-            raise SmartSimError(f"An unknown key was requested: {key}") from ex
+        except KeyError:
+            raise
         except Exception as ex:
             # note: explicitly avoid round-trip to check for key existence
             raise SmartSimError(
diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index b56a92c5b..003e27e8c 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -25,6 +25,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import time
+import uuid
 
 import pytest
 
@@ -33,12 +35,13 @@
 
 from smartsim._core.launcher.dragon.dragonBackend import DragonBackend
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.control.event_listener import (
+    ConsumerRegistrationListener,
+)
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
-    EventBase,
-    EventBroadcaster,
-    EventConsumer,
     OnCreateConsumer,
+    OnShutdownRequested,
 )
 from smartsim.log import get_logger
 
@@ -47,71 +50,258 @@
 logger = get_logger(__name__)
 
 
-def test_dragonbackend_listener_boostrapping(monkeypatch: pytest.MonkeyPatch):
-    """Verify that the dragon backend registration channel correctly
-    registers new consumers in the backbone and begins sending events
-    to the new consumers."""
+def test_dragonbackend_start_listener():
+    """Verify the background process listening to consumer registration events
+    is up and processing messages as expected."""
+    backend = DragonBackend(pid=9999)
+
+    # We need to let the backend create the backbone to continue
+    backbone = backend._create_backbone()
+    backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER)
+
+    os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor
+
+    with pytest.raises(KeyError) as ex:
+        # we expect the value of the consumer to be empty until
+        # the listener start-up completes.
+        backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+
+    assert "not found" in ex.value.args[0]
+
+    drg_process = backend.start_event_listener(cpu_affinity=[], gpu_affinity=[])
+
+    # confirm there is a process still running
+    logger.info(f"Dragon process started: {drg_process}")
+    assert drg_process is not None, "Backend was unable to start event listener"
+    assert drg_process.puid != 0, "Process unique ID is empty"
+    assert drg_process.returncode is None, "Listener terminated early"
+
+    # wait for the event listener to come up
+    try:
+        config = backbone.wait_for(
+            [BackboneFeatureStore.MLI_BACKEND_CONSUMER], timeout=30
+        )
+        # verify result was in the returned configuration map
+        assert config[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+    except Exception:
+        raise KeyError(
+            f"Unable to locate {BackboneFeatureStore.MLI_BACKEND_CONSUMER} "
+            "in the backbone"
+        )
+
+    # wait_for ensures the normal retrieval will now work, error-free
+    descriptor = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+    assert descriptor is not None
+
+    # attach to the registrar channel and simulate a consumer registration
+    comm_channel = DragonCommChannel.from_descriptor(descriptor)
+    mock_descriptor = str(uuid.uuid4())
+    event = OnCreateConsumer(mock_descriptor, [])
+
+    event_bytes = bytes(event)
+    comm_channel.send(event_bytes)
+
+    subscriber_list = []
+
+    # Give the channel time to write the message and the listener time to handle it
+    for i in range(20):
+        time.sleep(1)
+        # Retrieve the subscriber list from the backbone and verify it is updated
+        if subscriber_list := backbone.notification_channels:
+            logger.debug(f"The subscriber list was populated after {i} iterations")
+            break
+
+    assert mock_descriptor in subscriber_list
+
+    # check whether the listener terminated on its own
+    return_code = drg_process.returncode
+
+    # clean up the listener process since no shutdown event was sent
+    if return_code is None and drg_process.is_alive:
+        drg_process.kill()
+        drg_process.join()
+
+
+def test_dragonbackend_backend_consumer():
+    """Verify the listener background process updates the MLI_BACKEND_CONSUMER
+    value in the backbone."""
+    backend = DragonBackend(pid=9999)
+
+    # We need to let the backend create the backbone to continue
+    backbone = backend._create_backbone()
+    assert backbone._allow_reserved_writes
+
+    # create listener with `as_service=False` to perform a single loop iteration
+    listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False)
+
+    logger.debug(f"backbone loaded? {listener._backbone}")
+    logger.debug(f"listener created? {listener}")
+
+    try:
+        # call the service execute method directly to trigger
+        # the entire service lifecycle
+        listener.execute()
+
+        consumer_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+        logger.debug(f"MLI_BACKEND_CONSUMER: {consumer_desc}")
+
+        assert consumer_desc
+    except Exception as ex:
+        logger.exception(
+            f"test_dragonbackend_backend_consumer - exception occurred: {ex}"
+        )
+    finally:
+        listener._on_shutdown()
+
+
+def test_dragonbackend_event_handled():
+    """Verify the event listener process updates the MLI_NOTIFY_CONSUMERS
+    value in the backbone when an event is received and again on shutdown.
+    """
     backend = DragonBackend(pid=9999)
 
-    backend._create_backbone()
-    backbone = backend._backbone
+    # We need to let the backend create the backbone to continue
+    backbone = backend._create_backbone()
 
-    def mock_event_handler(event: EventBase) -> None:
-        logger.debug(f"Handling event in mock handler: {event}")
+    # create the listener to be tested
+    listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False)
 
-        bb_descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
-        assert bb_descriptor
+    assert listener._backbone, "The listener is not attached to a backbone"
 
-        fs = BackboneFeatureStore.from_descriptor(bb_descriptor)
-        fs[event.uid] = "received"
+    try:
+        # set up the listener but don't let the service event loop start
+        listener._create_eventing()  # listener.execute()
 
-    # create the consumer and start a listener process
-    backend_consumer = backend._create_eventing(backbone)
-    registrar_descriptor = backend._event_consumer.descriptor
+        # grab the channel descriptor so we can simulate registrations
+        channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+        comm_channel = DragonCommChannel.from_descriptor(channel_desc)
 
-    # ensure the consumer is stored to backend & published to backbone
-    assert backend._event_consumer == backend_consumer
-    assert backbone.backend_channel == registrar_descriptor
-    assert os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
+        num_events = 5
+        events = []
+        for i in range(num_events):
+            # register some mock consumers using the backend channel
+            event = OnCreateConsumer(f"mock-consumer-descriptor-{uuid.uuid4()}", [])
+            event_bytes = bytes(event)
+            comm_channel.send(event_bytes)
+            events.append(event)
 
-    # simulate a new consumer registration
-    new_consumer_ch = DragonCommChannel.from_local()
-    new_consumer = EventConsumer(
-        new_consumer_ch,
+        # run a few iterations of the event loop in case it takes a few cycles to write
+        for i in range(20):
+            listener._on_iteration()
+            # Grab the value that should be getting updated
+            notify_consumers = set(backbone.notification_channels)
+            if len(notify_consumers) == len(events):
+                logger.info(f"Retrieved all consumers after {i} listen cycles")
+                break
+
+        # ...
and confirm that all the mock consumer descriptors are registered + assert set([e.descriptor for e in events]) == set(notify_consumers) + logger.info(f"Number of registered consumers: {len(notify_consumers)}") + + except Exception as ex: + logger.exception(f"test_dragonbackend_event_handled - exception occurred: {ex}") + finally: + # shutdown should unregister a registration listener + listener._on_shutdown() + + for i in range(10): + if "BackboneFeatureStore.MLI_BACKEND_CONSUMER" not in backbone: + logger.debug(f"The listener was removed after {i} iterations") + channel_desc = None + break + + # we should see that there is no listener registered + assert not channel_desc + + +def test_dragonbackend_shutdown_event(): + """Verify the background process shuts down when it receives a + shutdown request.""" + backend = DragonBackend(pid=9999) + + # We need to let the backend create the backbone to continue + backbone = backend._create_backbone() + + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + + logger.debug(f"backbone loaded? {listener._backbone}") + logger.debug(f"listener created? {listener}") + + try: + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() + + # grab the channel descriptor so we can publish to it + channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + comm_channel = DragonCommChannel.from_descriptor(channel_desc) + + assert listener._consumer.listening, "Listener wasn't ready to listen" + + # send a shutdown request... + event = OnShutdownRequested() + event_bytes = bytes(event) + comm_channel.send(event_bytes) + + # run iteration a few times in case it takes a few cycles to write + for _ in range(5): + listener._on_iteration() + + logger.info(f"{listener._consumer.listening=}") + + # ...and confirm the listener is now cancelled + assert not listener._consumer.listening + + except Exception as ex: + logger.exception( + f"test_dragonbackend_shutdown_event - exception occurred: {ex}" + ) + + +@pytest.mark.parametrize("health_check_frequency", [10, 20]) +def test_dragonbackend_shutdown_on_health_check(health_check_frequency: float): + """Verify that the event listener automatically shuts down when + a new listener is registered in its place. + + :param health_check_frequency: The expected frequency of service health check + invocations""" + backend = DragonBackend(pid=9999) + + # We need to let the backend create the backbone to continue + backbone = backend._create_backbone() + + listener = ConsumerRegistrationListener( backbone, + 1.0, + 1.0, [], - name="test-consumer-a", - event_handler=mock_event_handler, + as_service=True, # allow service to run long enough to health check + health_check_frequency=health_check_frequency, ) - assert new_consumer, "new_consumer construction failed" - - # send registration to registrar channel - new_consumer.register() - # the backend consumer should handle updating the notify list and the new - # consumer that just broadcast its registration should be registered... 
-
-    # backend_consumer.listen_once(timeout=2.0)
-    backend.listen_to_registrations(timeout=0.1)
+    try:
+        # set up the listener but don't let the listener loop start
+        listener._create_eventing()  # listener.execute()
+        assert listener._consumer.listening, "Listener wasn't ready to listen"
 
-    # # confirm the backend registrar consumer registerd the new listener
-    assert new_consumer_ch.descriptor in backbone.notification_channels
+        # Replace the consumer descriptor in the backbone to trigger
+        # an automatic shutdown
+        backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = str(uuid.uuid4())
 
-    broadcaster = EventBroadcaster(backbone, DragonCommChannel.from_descriptor)
+        # set the last health check manually to verify the duration
+        start_at = time.time()
+        listener._last_health_check = time.time()
 
-    # re-send the same thing because i'm too lazy to create a new consumer
-    broadcast_event = OnCreateConsumer(registrar_descriptor, [])
-    broadcaster.send(broadcast_event, timeout=0.1)
+        # run execute to let the service trigger health checks
+        listener.execute()
+        elapsed = time.time() - start_at
 
-    new_consumer.listen_once(timeout=0.1)
+        # confirm the frequency of the health check was honored
+        assert elapsed >= health_check_frequency
 
-    values = backbone.wait_for(
-        [broadcast_event.uid, BackboneFeatureStore.MLI_NOTIFY_CONSUMERS], 1.0
-    )
-    stored = values[broadcast_event.uid]
-    assert stored == "received", "The handler didn't update the backbone"
+        # ...and confirm the listener is now cancelled
+        assert (
+            not listener._consumer.listening
+        ), "Listener was not automatically shutdown by the health check"
 
-    # confirm that directly retrieving the value isn't different from
-    # using backbone.notification_channels helper method
-    notify_list = str(values[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS]).split(",")
-    assert new_consumer.descriptor in set(notify_list)
+    except Exception as ex:
+        logger.exception(
+            f"test_dragonbackend_shutdown_on_health_check - exception occurred: {ex}"
+        )
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index 4f511a9c3..df370cbc4 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -29,8 +29,6 @@
 
 import pytest
 
-from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
-
 dragon = pytest.importorskip("dragon")
 
 import multiprocessing as mp
@@ -57,6 +55,7 @@
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim._core.mli.infrastructure.storage.feature_store import (
     FeatureStore,
     ModelKey,
diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py
index 35720fa9d..c08a8f30e 100644
--- a/tests/dragon/test_featurestore.py
+++ b/tests/dragon/test_featurestore.py
@@ -34,8 +34,6 @@
 
 import pytest
 
-from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
-
 dragon = pytest.importorskip("dragon")
 
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
@@ -52,6 +50,7 @@
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     time as bbtime,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim.log import get_logger
 
 logger = get_logger(__name__)
diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py
index a2df57f3b..819414eca 100644
--- a/tests/dragon/test_worker_manager.py
+++ b/tests/dragon/test_worker_manager.py
@@ -31,17 +31,11 @@
 
 import pytest
 
-from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
-
 torch = pytest.importorskip("torch")
 dragon = pytest.importorskip("dragon")
 
 import multiprocessing as mp
 
-from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
-    BackboneFeatureStore,
-)
-
 try:
     mp.set_start_method("dragon")
 except Exception:
@@ -58,9 +52,13 @@
     EnvironmentConfigLoader,
     WorkerManager,
 )
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
 from smartsim._core.mli.message_handler import MessageHandler
 from smartsim.log import get_logger

From 51baf611cc42d891748fa01abb7cc233abbc6508 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 01:01:35 -0500
Subject: [PATCH 22/40] stringification bug fix

---
 tests/dragon/test_dragon_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index 003e27e8c..229855bc5 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -203,7 +203,7 @@ def test_dragonbackend_event_handled():
         listener._on_shutdown()
 
         for i in range(10):
-            if "BackboneFeatureStore.MLI_BACKEND_CONSUMER" not in backbone:
+            if BackboneFeatureStore.MLI_BACKEND_CONSUMER not in backbone:
                 logger.debug(f"The listener was removed after {i} iterations")
                 channel_desc = None
                 break

From 3f4af8eb6c451800bd3421817bf56ccb6941b160 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 01:22:07 -0500
Subject: [PATCH 23/40] remove use of deprecated class

---
 .../_core/mli/infrastructure/storage/dragon_feature_store.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index c8c85623f..dc0f57ae6 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -68,7 +68,7 @@ def _get(self, key: str) -> t.Union[str, bytes]:
         """
         try:
             return self._storage[key]
-        except dragon_ddict.DDictKeyError as e:
+        except dragon_ddict.DDictError as e:
             raise KeyError(f"Key not found in FeatureStore: {key}") from e
 
     def _set(self, key: str, value: t.Union[str, bytes]) -> None:
@@ -96,7 +96,7 @@ def pop(self, key: str) -> t.Union[str, bytes, None]:
         """
         try:
             return self._storage.pop(key)
-        except dragon_ddict.DDictKeyError:
+        except dragon_ddict.DDictError:
             return None

From 979373c2d2686a97a19a4ad439c97851b5d90ae7 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 18:02:05 -0500
Subject: [PATCH 24/40] review changes part 1, improve dragon error handling,
 add unregister consumers, more tests

---
 smartsim/_core/_cli/scripts/dragon_install.py |   4 +-
 .../_core/launcher/dragon/dragonBackend.py    |   3 +-
 .../_core/mli/comm/channel/dragon_channel.py  |   4 +-
 smartsim/_core/mli/comm/channel/dragon_fli.py |   3 +
 .../_core/mli/comm/channel/dragon_util.py     |  36 +-
 .../infrastructure/control/event_listener.py  |  61 ++-
 .../storage/backbone_feature_store.py         | 110 ++++-
 smartsim/protoclient.py                       |   6 +-
tests/dragon/test_event_consumer.py | 408 ++++++++++++++++++ tests/dragon/test_featurestore.py | 85 ---- tests/dragon/test_featurestore_integration.py | 84 ---- tests/test_dragon_comm_utils.py | 228 ++++++++++ 12 files changed, 810 insertions(+), 222 deletions(-) create mode 100644 tests/dragon/test_event_consumer.py create mode 100644 tests/test_dragon_comm_utils.py diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 662820fed..d9d0ef3c7 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -57,7 +57,7 @@ def __init__( def _check(self) -> None: """Perform validation of this instance - :raises: ValueError if any value fails validation""" + :raises ValueError: if any value fails validation""" if not self.repo_name or len(self.repo_name.split("/")) != 2: raise ValueError( f"Invalid dragon repository name. Example: `dragonhpc/dragon`" @@ -287,7 +287,7 @@ def retrieve_asset( :param request: details of a request for the installation of the dragon package :param asset: GitHub release asset to retrieve :returns: path to the directory containing the extracted release asset - :raises: SmartSimCLIActionCancelled if the asset cannot be downloaded or extracted + :raises SmartSimCLIActionCancelled: if the asset cannot be downloaded or extracted """ download_dir = request.working_dir / str(asset.id) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fa28f8690..f5c271518 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -53,7 +53,6 @@ ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventCategory, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError @@ -592,7 +591,7 @@ def start_event_listener( raise SmartSimError("Backbone feature store is not available") service = ConsumerRegistrationListener( - self._backbone, 1.0, 2.0, [EventCategory.CONSUMER_CREATED], True + self._backbone, 1.0, 2.0, as_service=True, health_check_frequency=90 ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 7534719e7..4ccf7cf7f 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -55,6 +55,7 @@ def __init__(self, channel: "dch.Channel") -> None: descriptor = drg_util.channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel + """The underlying dragon channel used by this CommChannel for communications""" @property def channel(self) -> "dch.Channel": @@ -113,9 +114,6 @@ def from_descriptor( :raises SmartSimError: If creation of comm channel fails """ try: - if isinstance(descriptor, bytes): - raise ValueError("Descriptor must be a string") - channel = drg_util.descriptor_to_channel(descriptor) return DragonCommChannel(channel) except Exception as ex: diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 13eb58a2e..254a21c5b 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -59,9 +59,12 @@ def __init__( super().__init__(descriptor) self._fli = fli_ + """The underlying dragon FLInterface used by this 
CommChannel for communications"""
         self._channel: t.Optional["dch.Channel"] = (
             drg_util.create_local(buffer_size) if sender_supplied else None
         )
+        """The underlying dragon Channel used by a sender-side DragonFLIChannel
+        to attach to the main FLI channel"""
 
     def send(self, value: bytes, timeout: float = 0.001) -> None:
         """Send a message through the underlying communication channel.
diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
index 014e9c0a4..8edff31c0 100644
--- a/smartsim/_core/mli/comm/channel/dragon_util.py
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import base64
+import binascii
 import typing as t
 
 import dragon.channels as dch
@@ -53,6 +54,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str
 
     :param channel: The dragon channel to convert
     :returns: The descriptor string
+    :raises SmartSimError: If a dragon channel is not provided
     """
     if channel is None:
         raise SmartSimError("Channel is not available to create a descriptor")
@@ -78,9 +80,20 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface":
     the string-encoded descriptor.
 
     :param descriptor: The descriptor of an FLI to attach to
-    :returns: The attached dragon FLI"""
-    descriptor_ = base64.b64decode(descriptor.encode("utf-8"))
-    return fli.FLInterface.attach(descriptor_)
+    :returns: The attached dragon FLI
+    :raises ValueError: If the descriptor is empty or incorrectly formatted
+    :raises SmartSimError: If the descriptor does not address an available FLI
+    """
+    if len(descriptor) < 1:
+        raise ValueError("Descriptors may not be empty")
+
+    try:
+        encoded = descriptor.encode("utf-8")
+        descriptor_ = base64.b64decode(encoded)
+        return fli.FLInterface.attach(descriptor_)
+    except binascii.Error:
+        raise ValueError("The descriptor was not properly base64 encoded")
+    except fli.DragonFLIError:
+        raise SmartSimError("The descriptor did not address an available FLI")
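+
+# A round-trip sketch for these helpers, assuming a process-local channel:
+#
+#   ch = create_local()
+#   desc = channel_to_descriptor(ch)        # base64 text, safe to store/share
+#   attached = descriptor_to_channel(desc)  # re-attaches to the same channel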
:param descriptor: The descriptor of a channel to attach to - :returns: The attached dragon Channel""" - descriptor_ = base64.b64decode(descriptor.encode("utf-8")) - return dch.Channel.attach(descriptor_) + :returns: The attached dragon Channel + :raises ValueError: If the descriptor is empty or incorrectly formatted + :raises SmartSimError: If the descriptor does not attach to a channel""" + if len(descriptor) < 1: + raise ValueError("Descriptors may not be empty") + + try: + encoded = descriptor.encode("utf-8") + descriptor_ = base64.b64decode(encoded) + return dch.Channel.attach(descriptor_) + except binascii.Error: + raise ValueError("The descriptor was not properly base64 encoded") + except dch.ChannelError: + raise SmartSimError("The descriptor did not address an available channel") def create_local(_capacity: int = 0) -> dch.Channel: diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/event_listener.py index 03d7b1ceb..f1b7b664e 100644 --- a/smartsim/_core/mli/infrastructure/control/event_listener.py +++ b/smartsim/_core/mli/infrastructure/control/event_listener.py @@ -51,6 +51,7 @@ EventCategory, EventConsumer, OnCreateConsumer, + OnRemoveConsumer, ) from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -67,7 +68,6 @@ def __init__( backbone: BackboneFeatureStore, timeout: float, batch_timeout: float, - event_filters: t.List[EventCategory], as_service: bool = False, cooldown: int = 0, health_check_frequency: float = 60.0, @@ -94,9 +94,6 @@ def __init__( """Maximum time (in seconds) to allow a batch of receives to continue to build""" - self._filters = event_filters - """Filters specifying the message types to handle""" - self._consumer: t.Optional[EventConsumer] = None """The event consumer that handles receiving events""" @@ -118,6 +115,9 @@ def _on_shutdown(self) -> None: # unregister this listener in the backbone self._backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER) + # TODO: need the channel to be cleaned up + # self._consumer._comm_channel._channel.destroy() + def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline.""" @@ -148,6 +148,33 @@ def _can_shutdown(self) -> bool: return False + def _on_unregister(self, event: OnRemoveConsumer) -> None: + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ + notify_list = set(self._backbone.notification_channels) + + # remove the descriptor specified in the event + if event.descriptor in notify_list: + logger.debug(f"Removing notify consumer: {event.descriptor}") + notify_list.remove(event.descriptor) + + # push the updated list back into the backbone + self._backbone.notification_channels = list(notify_list) + + def _on_register(self, event: OnCreateConsumer) -> None: + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ + notify_list = set(self._backbone.notification_channels) + logger.debug(f"Adding notify consumer: {event.descriptor}") + notify_list.add(event.descriptor) + self._backbone.notification_channels = list(notify_list) + def _on_event_received(self, event: EventBase) -> None: """Event handler for updating the backbone when new event consumers are registered. 
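The `_on_register` and `_on_unregister` handlers added above reduce registration bookkeeping to set operations over the backbone's `notification_channels` list. A minimal, runnable sketch of that round-trip, using a hypothetical dict-backed stand-in for the backbone (illustration only, not part of the patch):

    # _StubBackbone is a hypothetical substitute for the real BackboneFeatureStore
    class _StubBackbone:
        def __init__(self):
            self.notification_channels = []

    def register(backbone, descriptor):
        # mirrors _on_register: set semantics de-duplicate repeat registrations
        notify_list = set(backbone.notification_channels)
        notify_list.add(descriptor)
        backbone.notification_channels = list(notify_list)

    def unregister(backbone, descriptor):
        # mirrors _on_unregister: only remove descriptors that are present
        notify_list = set(backbone.notification_channels)
        if descriptor in notify_list:
            notify_list.remove(descriptor)
        backbone.notification_channels = list(notify_list)

    backbone = _StubBackbone()
    register(backbone, "channel-abc")
    register(backbone, "channel-abc")  # the duplicate collapses in the set
    assert backbone.notification_channels == ["channel-abc"]
    unregister(backbone, "channel-abc")
    assert backbone.notification_channels == []

The set-based approach makes repeated registrations idempotent and unknown removals harmless, at the cost of not preserving registration order.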
@@ -157,16 +184,15 @@ def _on_event_received(self, event: EventBase) -> None:
         if self._backbone is None:
             logger.info("Unable to handle event. Backbone is missing.")
 
-        if not isinstance(event, OnCreateConsumer):
+        if isinstance(event, OnCreateConsumer):
+            self._on_register(event)
+        elif isinstance(event, OnRemoveConsumer):
+            self._on_unregister(event)
+        else:
             logger.info(
                 "Consumer registration listener received an "
                 f"unexpected event: {event=}"
             )
-            return
-
-        notify_list = set(self._backbone.notification_channels)
-        notify_list.add(event.descriptor)
-        self._backbone.notification_channels = list(notify_list)
 
     def _on_health_check(self) -> None:
         """Check if this consumer has been replaced by a new listener
@@ -190,9 +216,9 @@ def _on_health_check(self) -> None:
             self._consumer.listening = False
 
     def _publish_consumer(self) -> None:
-        """Publish the consumer descriptor to the backbone."""
+        """Publish the registrar consumer descriptor to the backbone."""
         if self._consumer is None:
-            logger.warning("No consumer descriptor available to publisher")
+            logger.warning("No registrar consumer descriptor available to publisher")
             return
 
         self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = (
@@ -227,8 +253,8 @@ def _create_eventing(self) -> EventConsumer:
         self._consumer = EventConsumer(
             event_channel,
             self._backbone,
-            self._filters,
-            name="BackendConsumerRegistrar",
+            [EventCategory.CONSUMER_CREATED, EventCategory.CONSUMER_REMOVED],
+            name="ConsumerRegistrar",
             event_handler=self._on_event_received,
         )
         self._publish_consumer()
@@ -248,17 +274,13 @@ def _create_parser() -> argparse.ArgumentParser:
 
     --timeout
     --batch_timeout
-    --categories
 
     :returns: A configured parser
     """
     arg_parser = argparse.ArgumentParser(prog="ConsumerRegistrarEventListener")
 
-    category_default = EventCategory.CONSUMER_CREATED
-
     arg_parser.add_argument("--timeout", type=float, default=1.0)
     arg_parser.add_argument("--batch_timeout", type=float, default=1.0)
-    arg_parser.add_argument("--categories", type=str, default=category_default)
 
     return arg_parser
@@ -285,9 +307,7 @@ def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
     mp.set_start_method("dragon")
 
     parser = _create_parser()
     args = parser.parse_args()
-
-    user_filters: t.List[EventCategory] = list(args.categories.split(","))
 
     backbone_fs = _connect_backbone()
@@ -304,7 +324,6 @@ def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
         backbone_fs,
         float(args.timeout),
         float(args.batch_timeout),
-        user_filters,
         as_service=True,
     )
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index ffeb917a9..859e767b6 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -260,6 +260,8 @@ class EventCategory(str, enum.Enum):
     CONSUMER_CREATED: str = "consumer-created"
     """Event category for an event raised when a new consumer is created"""
+    CONSUMER_REMOVED: str = "consumer-removed"
+    """Event category for an event raised when a consumer is removed"""
     FEATURE_STORE_WRITTEN: str = "feature-store-written"
     """Event category for an event raised when a feature store key is written"""
     SHUTDOWN: str = "shutdown"
@@ -327,6 +329,29 @@ def __str__(self) -> str:
         return f"{str(super())}|{self.descriptor}|{_filters}"
 
 
+class OnRemoveConsumer(EventBase):
+    """Publish this event when a consumer is shutting down and
+    should be removed from notification lists."""
+
+    descriptor: str
"""Descriptor of the comm channel exposed by the consumer""" + + def __init__(self, descriptor: str) -> None: + """Initialize the OnRemoveConsumer event. + + :param descriptor: Descriptor of the comm channel exposed by the consumer + """ + super().__init__(EventCategory.CONSUMER_REMOVED, str(uuid.uuid4())) + self.descriptor = descriptor + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}" + + class OnWriteFeatureStore(EventBase): """Publish this event when a feature store key is written.""" @@ -582,9 +607,13 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: class EventConsumer: """Reads system events published to a communications channel.""" + _BACKBONE_WAIT_TIMEOUT = 10.0 + """Maximum time (in seconds) to wait for the backbone to register the consumer""" + def __init__( self, comm_channel: CommChannelBase, + # channel_factory: ..., backbone: BackboneFeatureStore, filters: t.Optional[t.List[EventCategory]] = None, name: t.Optional[str] = None, @@ -601,11 +630,24 @@ def __init__( :raises ValueError: If batch_timeout <= 0 """ self._comm_channel = comm_channel + """The comm channel used by the consumer to receive messages. The channel + descriptor will be published for senders to discover.""" self._backbone = backbone + """The backbone instance used to bootstrap the instance. The EventConsumer + uses the backbone to discover where it can publish its descriptor.""" self._global_filters = filters or [] + """A set of global filters to apply to incoming events. Global filters are + combined with per-call filters. Filters act as an allow-list.""" self._name = name + """User-friendly name assigned to a consumer for logging. Automatically + assigned if not provided.""" self._event_handler = event_handler + """The function that should be executed when an event + passed by the filters is received.""" self.listening = True + """Flag indicating that the consumer is currently listening for new + events. Setting this flag to `False` will cause any active calls to + `listen` to terminate.""" @property def descriptor(self) -> str: @@ -639,10 +681,15 @@ def recv( :param batch_timeout: Maximum time to wait for messages to arrive; allows multiple batches to be retrieved in one call to `send` :returns: A list of events that pass any configured filters + :raises ValueError: If a positive, non-zero value is not provided for the + timeout or batch_timeout. """ if filters is None: filters = [] + if timeout is not None and timeout <= 0: + raise ValueError("request timeout must be a non-zero, positive value") + if batch_timeout is not None and batch_timeout <= 0: raise ValueError("batch_timeout must be a non-zero, positive value") @@ -688,25 +735,45 @@ def recv( return events_received + def _send_to_registrar(self, event: EventBase) -> None: + """Send an event direct to the registrar listener.""" + registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER + config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT) + registrar_descriptor = str(config.get(registrar_key, None)) + + if not registrar_descriptor: + logger.warning(f"Unable to {event.category}. 
No registrar channel found.") + return + + logger.debug(f"Sending {event.category} for {self.name}") + + registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) + registrar_channel.send(bytes(event), timeout=1.0) + + logger.debug(f"{event.category} for {self.name} sent") + def register(self) -> None: """Send an event to register this consumer as a listener.""" descriptor = self._comm_channel.descriptor event = OnCreateConsumer(descriptor, self._global_filters) - registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER - config = self._backbone.wait_for([registrar_key], 2.0) + self._send_to_registrar(event) - registrar_descriptor = str(config.get(registrar_key, None)) + def unregister(self) -> None: + """Send an event to un-register this consumer as a listener.""" + descriptor = self._comm_channel.descriptor + event = OnRemoveConsumer(descriptor) - if registrar_descriptor: - logger.debug(f"Sending registration for {self.name}") + self._send_to_registrar(event) - registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) - registrar_channel.send(bytes(event), timeout=1.0) + @staticmethod + def _on_handler_missing(event: EventBase) -> None: + """A "dead letter" event handler that is called to perform + processing on events before they're discarded. - logger.debug(f"Registration for {self.name} sent") - else: - logger.warning("Unable to register. No registrar channel found.") + :param event: The event to handle + """ + logger.warning(f"No event handler is registered. Discarding {event=}") def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: """Receives messages for the consumer a single time. Delivers @@ -724,30 +791,41 @@ def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> Non logger.debug(f"Starting event listener with {timeout} second timeout") logger.debug("Awaiting new messages") + if not self._event_handler: + logger.debug("Unable to handle messages. No event handler is registered.") + incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout) if not incoming_messages: - logger.debug("Consumer received empty message list.") + logger.debug(f"Consumer {self.name} received empty message list.") for message in incoming_messages: logger.debug(f"Sending event {message=} to handler.") self._handle_shutdown(message) + if self._event_handler: self._event_handler(message) + else: + self._on_handler_missing(message) - def _handle_shutdown(self, event: EventBase) -> None: + def _handle_shutdown(self, event: EventBase) -> bool: """Handles shutdown requests sent to the consumer by setting the - `self.listener` property to `False`.""" + `self.listener` property to `False`. + + :param event: The event to handle + :returns: A bool indicating if the event was a shutdown request + """ if isinstance(event, OnShutdownRequested): self.listening = False + return True + return False def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: - """Receives messages for the consumer until a shutdown request is received + """Receives messages for the consumer until a shutdown request is received. 
:param timeout: Maximum time to wait (in seconds) for a message to arrive - :param timeout: Maximum time to wait (in seconds) for a batch to arrive + :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive """ - self.listening = True while self.listening: self.listen_once(timeout, batch_timeout) diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index 7f6d6f412..d9cdcf594 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -108,7 +108,7 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: then attach an FLI to the given worker queue. :returns: The attached FLI channel - :raises: SmartSimError if the required configuration is not found in the + :raises SmartSimError: if the required configuration is not found in the backbone feature store """ @@ -150,7 +150,7 @@ def __init__( written to file :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the worker queue - :raises: SmartSimError if unable to attach to a backbone featurestore + :raises SmartSimError: If unable to attach to a backbone featurestore """ if MPI is not None: # TODO: determine a way to make MPI work in the test environment @@ -266,7 +266,7 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: :param model: The raw bytes or path to a pytorch model :param batch: The tensor batch to perform inference on :returns: The inference results - :raises: ValueError if the worker queue is not configured properly + :raises ValueError: if the worker queue is not configured properly in the environment variables """ tensors = [batch.numpy()] diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py new file mode 100644 index 000000000..adac966ab --- /dev/null +++ b/tests/dragon/test_event_consumer.py @@ -0,0 +1,408 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
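The consumer lifecycle exercised by the tests below can be summarized in a short sketch. This assumes a running dragon environment with a backbone whose registrar listener has already published its descriptor under `MLI_BACKEND_CONSUMER`; the print-based handler is a placeholder, and none of this is runnable outside such an environment:

    from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
    from smartsim._core.mli.comm.channel.dragon_util import create_local
    from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
        BackboneFeatureStore,
        EventCategory,
        EventConsumer,
    )

    def run_consumer(backbone: BackboneFeatureStore) -> None:
        # expose a local channel for senders to discover via the backbone
        channel = DragonCommChannel(create_local())
        consumer = EventConsumer(
            channel,
            backbone,
            filters=[EventCategory.FEATURE_STORE_WRITTEN],
            event_handler=lambda event: print(f"received {event=}"),
        )
        consumer.register()  # OnCreateConsumer sent to the registrar
        # process events until an OnShutdownRequested arrives
        consumer.listen(timeout=0.1, batch_timeout=1.0)
        consumer.unregister()  # OnRemoveConsumer prunes this channel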
+ +import time +import typing as t +from unittest import mock + +import pytest + +from smartsim._core.mli.infrastructure.control.event_listener import ( + ConsumerRegistrationListener, +) + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBase, + EventBroadcaster, + EventCategory, + EventConsumer, + OnCreateConsumer, + OnRemoveConsumer, + OnShutdownRequested, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + time as bbtime, +) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict +from smartsim.log import get_logger + +logger = get_logger(__name__) + +# isort: off +from dragon import fli +from dragon.channels import Channel + +# isort: on + +if t.TYPE_CHECKING: + import conftest + + +# The tests in this file must run in a dragon environment +pytestmark = pytest.mark.dragon + + +@pytest.fixture(scope="module") +def the_storage() -> t.Dict[str, str]: + """Fixture to instantiate a dragon distributed dictionary.""" + return create_ddict(1, 2, 4 * 1024**2) + + +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. Does not modify environment vars.""" + channel_ = create_local() + fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + return comm_channel + + +@pytest.fixture(scope="module") +def the_backbone( + the_storage: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. + + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel + """ + + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone + + +def test_eventconsumer_eventpublisher_integration( + the_backbone: t.Any, test_dir: str +) -> None: + """Verify that the publisher and consumer integrate as expected when + multiple publishers and consumers are sending simultaneously. This + test closely tracks the test in tests/test_featurestore_base.py also named + test_eventconsumer_eventpublisher_integration but requires dragon entities. 
+ + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) + + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + the_backbone, + filters=[EventCategory.FEATURE_STORE_WRITTEN], + ) + capp_consumer = EventConsumer( + capp_channel, + the_backbone, + ) + back_consumer = EventConsumer( + back_channel, + the_backbone, + filters=[EventCategory.CONSUMER_CREATED], + ) + + # create some broadcasters to publish messages + mock_worker_mgr = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + mock_client_app = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. + the_backbone.notification_channels = [ + wmgr_consumer_descriptor, + capp_consumer_descriptor, + back_consumer_descriptor, + ] + + # simulate worker manager sending a notification to backend that it's alive + event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[]) + mock_worker_mgr.send(event_1) + + # simulate the app updating a model a few times + for key in ["key-1", "key-2", "key-1"]: + event = OnWriteFeatureStore(the_backbone.descriptor, key) + mock_client_app.send(event, timeout=0.1) + + # worker manager should only get updates about feature update + wmgr_messages = wmgr_consumer.recv() + assert len(wmgr_messages) == 3 + + # the backend should only receive messages about consumer creation + back_messages = back_consumer.recv() + assert len(back_messages) == 1 + + # hypothetical app has no filters and will get all events + app_messages = capp_consumer.recv() + assert len(app_messages) == 4 + + +@pytest.mark.parametrize( + " timeout, batch_timeout, exp_err_msg", + [(-1, 1, " timeout"), (1, -1, "batch_timeout")], +) +def test_eventconsumer_invalid_timeout( + timeout: float, + batch_timeout: float, + exp_err_msg: str, + test_dir: str, + the_backbone: BackboneFeatureStore, +) -> None: + """Verify that the event consumer raises an exception + when provided an invalid request timeout. 
+
+    :param timeout: The request timeout for the event consumer recv call
+    :param batch_timeout: The batch timeout for the event consumer recv call
+    :param exp_err_msg: A unique value from the error message that should be raised
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    wmgr_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+    wmgr_consumer = EventConsumer(
+        wmgr_channel,
+        the_backbone,
+        filters=[EventCategory.FEATURE_STORE_WRITTEN],
+    )
+
+    # the consumer should report an error for the invalid timeout value
+    with pytest.raises(ValueError) as ex:
+        wmgr_consumer.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+    assert exp_err_msg in ex.value.args[0]
+
+
+def test_eventconsumer_no_event_handler_registered(
+    the_backbone: t.Any, test_dir: str
+) -> None:
+    """Verify that a consumer discards messages received on a
+    channel when no event handler is registered.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    wmgr_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+    wmgr_consumer = EventConsumer(wmgr_channel, the_backbone, event_handler=None)
+
+    # create a broadcaster to publish messages
+    mock_worker_mgr = EventBroadcaster(
+        the_backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+
+    # manually register the consumers since we don't have a backend running
+    the_backbone.notification_channels = [wmgr_channel.descriptor]
+
+    # simulate the app updating a model a few times
+    for key in ["key-1", "key-2", "key-1"]:
+        event = OnWriteFeatureStore(the_backbone.descriptor, key)
+        mock_worker_mgr.send(event, timeout=0.1)
+
+    # run the handler and let it discard messages
+    for _ in range(15):
+        wmgr_consumer.listen_once(0.2, 2.0)
+
+    assert wmgr_consumer.listening
+
+
+def test_eventconsumer_no_event_handler_registered_shutdown(
+    the_backbone: t.Any, test_dir: str
+) -> None:
+    """Verify that a consumer without an event handler
+    registered still honors shutdown requests.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    wmgr_channel = DragonCommChannel(create_local())
+    capp_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+    wmgr_consumer = EventConsumer(wmgr_channel, the_backbone)
+
+    # create a broadcaster to publish messages
+    mock_worker_mgr = EventBroadcaster(
+        the_backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+
+    # manually register the consumers since we don't have a backend running
+    the_backbone.notification_channels = [
+        wmgr_channel.descriptor,
+        capp_channel.descriptor,
+    ]
+
+    # simulate the app updating a model a few times
+    for key in ["key-1", "key-2", "key-1"]:
+        event = OnWriteFeatureStore(the_backbone.descriptor, key)
+        mock_worker_mgr.send(event, timeout=0.1)
+
+    event = OnShutdownRequested()
+    mock_worker_mgr.send(event, timeout=0.1)
+
+    # wmgr will stop listening to messages when it is told to stop listening
+    wmgr_consumer.listen(timeout=0.1, batch_timeout=2.0)
+
+    for _ in range(15):
+        wmgr_consumer.listen_once(timeout=0.1, batch_timeout=2.0)
+
+    # confirm the messages were processed, discarded, and the shutdown was received
+    assert wmgr_consumer.listening == False
+
+
+def test_eventconsumer_registration(
+    the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Verify that a consumer is correctly registered in
+    the backbone after sending a registration request. Then,
+    confirm the consumer is unregistered after sending the
+    un-register request.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    with monkeypatch.context() as patch:
+        registrar = ConsumerRegistrationListener(
+            the_backbone, 1.0, 2.0, as_service=False
+        )
+
+        # NOTE: service.execute(as_service=False) will complete the service life-
+        # cycle and remove the registrar from the backbone, so mock _on_shutdown
+        disabled_shutdown = mock.MagicMock()
+        patch.setattr(registrar, "_on_shutdown", disabled_shutdown)
+
+        # initialize registrar resources
+        registrar.execute()
+
+        # create a consumer that will be registered
+        wmgr_channel = DragonCommChannel(create_local())
+        wmgr_consumer = EventConsumer(wmgr_channel, the_backbone)
+
+        registered_channels = the_backbone.notification_channels
+
+        # trigger the consumer-to-registrar handshake
+        wmgr_consumer.register()
+
+        current_registrations: t.List[str] = []
+
+        # have the registrar run a few times to pick up the msg
+        for i in range(15):
+            registrar.execute()
+            current_registrations = the_backbone.notification_channels
+            if len(current_registrations) != len(registered_channels):
+                logger.debug(f"The event was processed on iteration {i}")
+                break
+
+        # confirm the consumer is registered
+        assert wmgr_channel.descriptor in current_registrations
+
+        # copy old list so we can compare against it.
+        registered_channels = list(current_registrations)
+
+        # trigger the consumer removal
+        wmgr_consumer.unregister()
+
+        # have the registrar run a few times to pick up the msg
+        for i in range(15):
+            registrar.execute()
+            current_registrations = the_backbone.notification_channels
+            if len(current_registrations) != len(registered_channels):
+                logger.debug(f"The event was processed on iteration {i}")
+                break
+
+        # confirm the consumer is no longer registered
+        assert wmgr_channel.descriptor not in current_registrations
+
+
+def test_registrar_teardown(
+    the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Verify that the consumer registrar removes itself from
+    the backbone when it shuts down.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    with monkeypatch.context() as patch:
+        registrar = ConsumerRegistrationListener(
+            the_backbone, 1.0, 2.0, as_service=False
+        )
+
+        # directly initialize registrar resources to avoid service life-cycle
+        registrar._create_eventing()
+
+        # confirm the registrar is published to the backbone
+        cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_BACKEND_CONSUMER], 10)
+        assert BackboneFeatureStore.MLI_BACKEND_CONSUMER in cfg
+
+        # execute the entire service lifecycle 1x
+        registrar.execute()
+
+        consumer_found = BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone
+
+        for i in range(15):
+            time.sleep(0.1)
+            consumer_found = BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone
+            if not consumer_found:
+                logger.debug(f"Registrar removed from the backbone on iteration {i}")
+                break
+
+        assert BackboneFeatureStore.MLI_BACKEND_CONSUMER not in the_backbone
diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py
index c08a8f30e..e34120c98 100644
--- a/tests/dragon/test_featurestore.py
+++ b/tests/dragon/test_featurestore.py
@@ -102,91 +102,6 @@ def the_backbone(
     return backbone
 
 
-def test_eventconsumer_eventpublisher_integration(
-    the_backbone: BackboneFeatureStore, test_dir: str
-) -> None:
-    """Verify that the publisher and consumer integrate as expected when
-    multiple publishers and consumers are sending simultaneously. This
-    test closely tracks the test in tests/test_featurestore.py also named
-    test_eventconsumer_eventpublisher_integration but requires dragon entities.
- - :param the_backbone: the dragon storage engine to use - :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs - """ - - # verify ability to write and read from ddict - the_backbone["test_dir"] = test_dir - assert the_backbone["test_dir"] == test_dir - - wmgr_channel = DragonCommChannel(create_local()) - capp_channel = DragonCommChannel(create_local()) - back_channel = DragonCommChannel(create_local()) - - wmgr_consumer_descriptor = wmgr_channel.descriptor - capp_consumer_descriptor = capp_channel.descriptor - back_consumer_descriptor = back_channel.descriptor - - # create some consumers to receive messages - wmgr_consumer = EventConsumer( - wmgr_channel, - the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - ) - capp_consumer = EventConsumer( - capp_channel, - the_backbone, - ) - back_consumer = EventConsumer( - back_channel, - the_backbone, - filters=[EventCategory.CONSUMER_CREATED], - ) - - # create some broadcasters to publish messages - mock_worker_mgr = EventBroadcaster( - the_backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - mock_client_app = EventBroadcaster( - the_backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - - # register all of the consumers even though the OnCreateConsumer really should - # trigger its registration. event processing is tested elsewhere. - the_backbone.notification_channels = [ - wmgr_consumer_descriptor, - capp_consumer_descriptor, - back_consumer_descriptor, - ] - - # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor, []) - mock_worker_mgr.send(event_1) - - # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") - event_3 = OnWriteFeatureStore(the_backbone.descriptor, "key-2") - event_4 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") - - mock_client_app.send(event_2) - mock_client_app.send(event_3) - mock_client_app.send(event_4) - - # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.recv() - assert len(wmgr_messages) == 3 - - # the backend should only receive messages about consumer creation - back_messages = back_consumer.recv() - assert len(back_messages) == 1 - - # hypothetical app has no filters and will get all events - app_messages = capp_consumer.recv() - assert len(app_messages) == 4 - - def test_backbone_wait_for_no_keys( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index e4d6bb9eb..895bc6467 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -86,90 +86,6 @@ def the_backbone(the_storage: t.Any) -> BackboneFeatureStore: return BackboneFeatureStore(the_storage, allow_reserved_writes=True) -def test_eventconsumer_eventpublisher_integration( - the_storage: t.Any, test_dir: str -) -> None: - """Verify that the publisher and consumer integrate as expected when - multiple publishers and consumers are sending simultaneously. This - test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities. 
- - :param the_storage: The dragon storage engine to use - :param test_dir: Automatically generated unique working - directories for individual test outputs - """ - - mock_storage = the_storage - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - - # verify ability to write and read from ddict - backbone["test_dir"] = test_dir - assert backbone["test_dir"] == test_dir - - wmgr_channel = DragonCommChannel(create_local()) - capp_channel = DragonCommChannel(create_local()) - back_channel = DragonCommChannel(create_local()) - - wmgr_consumer_descriptor = wmgr_channel.descriptor - capp_consumer_descriptor = capp_channel.descriptor - back_consumer_descriptor = back_channel.descriptor - - # create some consumers to receive messages - wmgr_consumer = EventConsumer( - wmgr_channel, - backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - ) - capp_consumer = EventConsumer( - capp_channel, - backbone, - ) - back_consumer = EventConsumer( - back_channel, - backbone, - filters=[EventCategory.CONSUMER_CREATED], - ) - - # create some broadcasters to publish messages - mock_worker_mgr = EventBroadcaster( - backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - mock_client_app = EventBroadcaster( - backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - - # register all of the consumers even though the OnCreateConsumer really should - # trigger its registration. event processing is tested elsewhere. - backbone.notification_channels = [ - wmgr_consumer_descriptor, - capp_consumer_descriptor, - back_consumer_descriptor, - ] - - # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[]) - mock_worker_mgr.send(event_1) - - # simulate the app updating a model a few times - for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(backbone.descriptor, key) - mock_client_app.send(event, timeout=0.1) - - # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.recv() - assert len(wmgr_messages) == 3 - - # the backend should only receive messages about consumer creation - back_messages = back_consumer.recv() - assert len(back_messages) == 1 - - # hypothetical app has no filters and will get all events - app_messages = capp_consumer.recv() - assert len(app_messages) == 4 - - @pytest.mark.parametrize( "num_events, batch_timeout, max_batches_expected", [ diff --git a/tests/test_dragon_comm_utils.py b/tests/test_dragon_comm_utils.py new file mode 100644 index 000000000..06d6e19b3 --- /dev/null +++ b/tests/test_dragon_comm_utils.py @@ -0,0 +1,228 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import uuid
+
+import pytest
+
+from smartsim.error.errors import SmartSimError
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.channels as dch
+import dragon.fli as fli
+
+# isort: on
+
+from smartsim._core.mli.comm.channel import dragon_util
+from smartsim.log import get_logger
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="function")
+def the_channel() -> dch.Channel:
+    """Creates a Channel attached to the local memory pool."""
+    channel = dch.Channel.make_process_local()
+    return channel
+
+
+@pytest.fixture(scope="function")
+def the_fli(the_channel) -> fli.FLInterface:
+    """Creates an FLI attached to the local memory pool."""
+    fli_ = fli.FLInterface(main_ch=the_channel, manager_ch=None)
+    return fli_
+
+
+def test_descriptor_to_channel_empty() -> None:
+    """Verify that `descriptor_to_channel` raises an exception when
+    provided with an empty descriptor."""
+    descriptor = ""
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_channel_b64fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_channel` raises an exception when
+    provided with an incorrectly encoded descriptor.
+
+    :param descriptor: A descriptor that is not properly base64 encoded
+    """
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    [str(uuid.uuid4())],
+)
+def test_descriptor_to_channel_channel_fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_channel` raises an exception when a correctly
+    formatted descriptor that does not describe a real channel is passed.
+
+    :param descriptor: A well-formed descriptor that does not reference
+    an existing channel
+    """
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    # ensure we're receiving the right exception
+    assert "address" in ex.value.args[0]
+    assert "channel" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_channel_not_available(the_channel: dch.Channel) -> None:
+    """Verify that `descriptor_to_channel` raises an exception when a channel
+    is no longer available.
+
+    :param the_channel: A dragon channel
+    """
+
+    # get a good descriptor & wipe out the channel so it can't be attached
+    descriptor = dragon_util.channel_to_descriptor(the_channel)
+    the_channel.destroy()
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_happy_path(the_channel: dch.Channel) -> None:
+    """Verify that `descriptor_to_channel` works as expected when provided
+    a valid descriptor.
+
+    :param the_channel: A dragon channel
+    """
+
+    # get a good descriptor
+    descriptor = dragon_util.channel_to_descriptor(the_channel)
+
+    reattached = dragon_util.descriptor_to_channel(descriptor)
+    assert reattached
+
+    # and just make sure creation of the descriptor is transitive
+    assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_descriptor_to_fli_empty() -> None:
+    """Verify that `descriptor_to_fli` raises an exception when
+    provided with an empty descriptor."""
+    descriptor = ""
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_fli_b64fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_fli` raises an exception when
+    provided with an incorrectly encoded descriptor.
+
+    :param descriptor: A descriptor that is not properly base64 encoded
+    """
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    [str(uuid.uuid4())],
+)
+def test_descriptor_to_fli_fli_fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_fli` raises an exception when a correctly
+    formatted descriptor that does not describe a real FLI is passed.
+
+    :param descriptor: A well-formed descriptor that does not reference
+    an existing FLI
+    """
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    # ensure we're receiving the right exception
+    assert "address" in ex.value.args[0]
+    assert "fli" in ex.value.args[0].lower()
+
+
+def test_descriptor_to_fli_fli_not_available(
+    the_fli: fli.FLInterface, the_channel: dch.Channel
+) -> None:
+    """Verify that `descriptor_to_fli` raises an exception when an FLI
+    is no longer available.
+
+    :param the_fli: A dragon FLInterface
+    :param the_channel: A dragon channel
+    """
+
+    # get a good descriptor & wipe out the FLI so it can't be attached
+    descriptor = dragon_util.channel_to_descriptor(the_fli)
+    the_fli.destroy()
+    the_channel.destroy()
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    # ensure we're receiving the right exception
+    assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_fli_happy_path(the_fli: fli.FLInterface) -> None:
+    """Verify that `descriptor_to_fli` works as expected when provided
+    a valid descriptor.
+
+    :param the_fli: A dragon FLInterface
+    """
+
+    # get a good descriptor
+    descriptor = dragon_util.channel_to_descriptor(the_fli)
+
+    reattached = dragon_util.descriptor_to_fli(descriptor)
+    assert reattached
+
+    # and just make sure creation of the descriptor is transitive
+    assert dragon_util.channel_to_descriptor(reattached) == descriptor

From 5898005662045e5245a97a3202c63b8505c5f8e7 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 18:16:18 -0500
Subject: [PATCH 25/40] import order follow-up

---
 smartsim/_core/mli/comm/channel/dragon_util.py          |  2 +-
 .../_core/mli/infrastructure/control/event_listener.py  | 10 +++++-----
 .../infrastructure/storage/backbone_feature_store.py    |  2 +-
 tests/dragon/test_event_consumer.py                     |  7 +++----
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
index 8edff31c0..258d84b3a 100644
--- a/smartsim/_core/mli/comm/channel/dragon_util.py
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -54,7 +54,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str
 
     :param channel: The dragon channel to convert
     :returns: The descriptor string
-    :raises: SmartSimError if a dragon channel is not provided
+    :raises SmartSimError: If a dragon channel is not provided
     """
     if channel is None:
         raise SmartSimError("Channel is not available to create a descriptor")
diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/event_listener.py
index f1b7b664e..2485f77ea 100644
--- a/smartsim/_core/mli/infrastructure/control/event_listener.py
+++ b/smartsim/_core/mli/infrastructure/control/event_listener.py
@@ -149,8 +149,8 @@ def _can_shutdown(self) -> bool:
         return False
 
     def _on_unregister(self, event: OnRemoveConsumer) -> None:
-        """Event handler for updating the backbone when new event consumers
-        are registered.
+        """Event handler for updating the backbone when event consumers
+        are un-registered.
 
         :param event: The event that was received
         """
@@ -176,8 +176,8 @@ def _on_register(self, event: OnCreateConsumer) -> None:
         self._backbone.notification_channels = list(notify_list)
 
     def _on_event_received(self, event: EventBase) -> None:
-        """Event handler for updating the backbone when new event consumers
-        are registered.
+        """Primary event handler for the listener. Distributes events to
+        type-specific handlers.
 
         :param event: The event that was received
         """
@@ -290,7 +290,7 @@ def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
 
     Load the backbone by retrieving the descriptor from environment variables.
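For reference, resolving the backbone from the environment follows the same pattern used by `standalone_worker_manager.py`. A sketch, assuming the launcher exported the backbone descriptor under `BackboneFeatureStore.MLI_BACKBONE` before this process started:

    import os

    from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
        BackboneFeatureStore,
    )

    descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "")
    if not descriptor:
        # no descriptor exported; nothing to attach to
        raise RuntimeError("Backbone descriptor not found in the environment")

    backbone = BackboneFeatureStore.from_descriptor(descriptor)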
:returns: The backbone feature store - :raises: SmartSimError if a descriptor is not found + :raises SmartSimError: if a descriptor is not found """ descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "") diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 859e767b6..21fdecbed 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -414,7 +414,7 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: :param event: The event to send :param timeout: Maximum time to wait (in seconds) for messages to send :returns: The number of message copies that were sent - :raises: SmartSimError if the comm channel is not configured + :raises SmartSimError: If the comm channel is not configured """ if self._channel is None: raise SmartSimError("No channel to send on") diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py index adac966ab..f361e6c16 100644 --- a/tests/dragon/test_event_consumer.py +++ b/tests/dragon/test_event_consumer.py @@ -30,15 +30,14 @@ import pytest -from smartsim._core.mli.infrastructure.control.event_listener import ( - ConsumerRegistrationListener, -) - dragon = pytest.importorskip("dragon") from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.control.event_listener import ( + ConsumerRegistrationListener, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBase, From af870f95e7e64bd4d7ea014b0a979422dd6157d0 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 3 Oct 2024 19:41:56 -0500 Subject: [PATCH 26/40] pr review updates --- conftest.py | 14 +- ex/high_throughput_inference/mock_app.py | 2 +- .../standalone_worker_manager.py | 2 +- smartsim/_core/_cli/scripts/dragon_install.py | 3 +- smartsim/_core/entrypoints/service.py | 26 +- .../_core/launcher/dragon/dragonBackend.py | 23 +- .../_core/launcher/dragon/dragonConnector.py | 3 + .../_core/mli/comm/channel/dragon_channel.py | 8 - smartsim/_core/mli/comm/channel/dragon_fli.py | 8 +- .../_core/mli/comm/channel/dragon_util.py | 53 +- .../_core/mli/infrastructure/comm/__init__.py | 0 .../mli/infrastructure/comm/broadcaster.py | 238 +++++++ .../_core/mli/infrastructure/comm/consumer.py | 283 ++++++++ .../_core/mli/infrastructure/comm/event.py | 162 +++++ .../_core/mli/infrastructure/comm/producer.py | 44 ++ .../{event_listener.py => listener.py} | 67 +- .../control/request_dispatcher.py | 2 +- .../mli/infrastructure/environment_loader.py | 9 +- .../storage/backbone_feature_store.py | 616 +----------------- .../storage/dragon_feature_store.py | 10 +- .../mli/infrastructure/storage/dragon_util.py | 1 + .../infrastructure/storage/feature_store.py | 2 +- .../_core/mli/infrastructure/worker/worker.py | 2 +- smartsim/_core/utils/timings.py | 8 +- smartsim/protoclient.py | 42 +- tests/dragon/test_dragon_backend.py | 178 ++--- tests/dragon/test_environment_loader.py | 18 +- tests/dragon/test_error_handling.py | 11 + tests/dragon/test_event_consumer.py | 62 +- tests/dragon/test_featurestore.py | 6 - tests/dragon/test_featurestore_base.py | 114 +++- tests/dragon/test_featurestore_integration.py | 22 +- 
tests/dragon/test_protoclient.py | 11 +- tests/dragon/test_worker_manager.py | 6 +- tests/dragon/utils/msg_pump.py | 3 +- tests/mli/test_service.py | 109 +++- tests/test_dragon_comm_utils.py | 29 + 37 files changed, 1281 insertions(+), 916 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/comm/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/comm/broadcaster.py create mode 100644 smartsim/_core/mli/infrastructure/comm/consumer.py create mode 100644 smartsim/_core/mli/infrastructure/comm/event.py create mode 100644 smartsim/_core/mli/infrastructure/comm/producer.py rename smartsim/_core/mli/infrastructure/control/{event_listener.py => listener.py} (84%) diff --git a/conftest.py b/conftest.py index 098a4a0c5..7302482e6 100644 --- a/conftest.py +++ b/conftest.py @@ -93,6 +93,7 @@ test_hostlist = None has_aprun = shutil.which("aprun") is not None + def get_account() -> str: return test_account @@ -459,15 +460,10 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture(scope="function", autouse=True) def check_output_dir() -> None: - try: - global test_output_dirs - assert os.path.isdir(test_output_root) - assert len(os.listdir(test_output_root)) >= test_output_dirs - test_output_dirs = len(os.listdir(test_output_root)) - except Exception: - # swallow error when the tests can't clean up test dirs - # and let the next run do the job. - ... + global test_output_dirs + assert os.path.isdir(test_output_root) + assert len(os.listdir(test_output_root)) >= test_output_dirs + test_output_dirs = len(os.listdir(test_output_root)) @pytest.fixture diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index f4db1bc1e..876f9145a 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -53,7 +53,7 @@ from smartsim.log import get_logger, log_to_file from smartsim.protoclient import ProtoClient -logger = get_logger("App", "DEBUG") +logger = get_logger("App") CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index fdef4268a..9a3926803 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -135,7 +135,7 @@ def service_as_dragon_proc( args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] + ddict_str = os.environ[BackboneFeatureStore.MLI_BACKBONE] backbone = BackboneFeatureStore.from_descriptor(ddict_str) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index d9d0ef3c7..b6666f7c8 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -95,14 +95,13 @@ def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]: def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None: """Create a .env file with required environment variables for the Dragon runtime""" dragon_root = str(dragon_root_dir) - dragon_rut_dir = dragon_root dragon_inc_dir = dragon_root + "/include" dragon_lib_dir = dragon_root + "/lib" dragon_bin_dir = dragon_root + "/bin" dragon_vars = { "DRAGON_BASE_DIR": dragon_root, - "DRAGON_ROOT_DIR": dragon_rut_dir, + "DRAGON_ROOT_DIR": dragon_root, "DRAGON_INCLUDE_DIR": dragon_inc_dir, "DRAGON_LIB_DIR": dragon_lib_dir, "DRAGON_VERSION": dragon_version, diff --git 
a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py
index 27d541312..497bdda2f 100644
--- a/smartsim/_core/entrypoints/service.py
+++ b/smartsim/_core/entrypoints/service.py
@@ -42,19 +42,21 @@ class Service(ABC):
     def __init__(
         self,
         as_service: bool = False,
-        cooldown: int = 0,
-        loop_delay: int = 0,
+        cooldown: float = 0,
+        loop_delay: float = 0,
         health_check_frequency: float = 0,
     ) -> None:
         """Initialize the ServiceHost
 
-        :param as_service: Determines if the host will run until shutdown criteria
-        are met or as a run-once instance
-        :param cooldown: Period of time to allow service to run before automatic
-        shutdown, in seconds. A non-zero, positive integer.
-        :param loop_delay: Delay between iterations of the event loop (in seconds)
-        :param health_check_frequency: Delay between calls to a
-        health check handler (in seconds)
+        :param as_service: Determines if the host runs continuously until
+        shutdown criteria are met, or executes the service lifecycle once and exits
+        :param cooldown: Period of time (in seconds) to allow the service to run
+        after a shutdown is permitted. Enables the service to avoid restarting if
+        new work is discovered. A value of 0 disables the cooldown.
+        :param loop_delay: Time (in seconds) between iterations of the event loop
+        :param health_check_frequency: Time (in seconds) between calls to a
+        health check handler. A value of 0 triggers the health check on every
+        iteration.
         """
         self._as_service = as_service
         """If the service should run until shutdown function returns True"""
@@ -64,8 +66,8 @@ def __init__(
         self._loop_delay = abs(loop_delay)
         """Forced delay between iterations of the event loop"""
         self._health_check_frequency = health_check_frequency
-        """The time (in seconds) between desired health checks. A health check
-        frequency of zero will never trigger the health check."""
+        """The time (in seconds) between desired health checks. Frequency of 0
+        will trigger the health check on every event loop iteration."""
         self._last_health_check = time.time()
         """The timestamp of the latest health check"""
@@ -135,7 +137,7 @@ def execute(self) -> None:
                     "Failure in event loop resulted in service termination"
                 )
 
-            if self._health_check_frequency > 0:
+            if self._health_check_frequency >= 0:
                 hc_elapsed = time.time() - self._last_health_check
                 if hc_elapsed >= self._health_check_frequency:
                     self._on_health_check()
diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index f5c271518..fb33460d8 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -48,7 +48,7 @@
 import dragon.native.machine as dragon_machine
 
 from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
-from smartsim._core.mli.infrastructure.control.event_listener import (
+from smartsim._core.mli.infrastructure.control.listener import (
     ConsumerRegistrationListener,
 )
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
@@ -158,6 +158,7 @@ class DragonBackend:
     """
 
     _DEFAULT_NUM_MGR_PER_NODE = 2
+    """The default number of manager processes for each feature store node"""
     _DEFAULT_MEM_PER_NODE = 256 * 1024**2
     """The default memory capacity to allocate for a feature store node
     (in megabytes)"""
@@ -550,7 +551,9 @@ def _stop_steps(self) -> None:
 
     def _create_backbone(self) -> BackboneFeatureStore:
         """
-        Create a BackboneFeatureStore if one does not exist.
+        Creates a BackboneFeatureStore if one does not exist.
Updates
+        environment variables of this process to include the backbone
+        descriptor.
 
         :returns: The descriptor of the backbone feature store
         """
@@ -587,6 +590,13 @@ def _initialize_cooldown() -> int:
     def start_event_listener(
         self, cpu_affinity: list[int], gpu_affinity: list[int]
     ) -> dragon_process.Process:
+        """Start a standalone event listener.
+
+        :param cpu_affinity: The CPU affinity for the process
+        :param gpu_affinity: The GPU affinity for the process
+        :returns: The dragon Process managing the listener process
+        :raises SmartSimError: If the backbone is not provided
+        """
         if self._backbone is None:
             raise SmartSimError("Backbone feature store is not available")
 
@@ -607,7 +617,7 @@ def start_event_listener(
             cwd=os.getcwd(),
             env={
                 **os.environ,
-                **(self._backbone.get_env() if self._backbone is not None else {}),
+                **self._backbone.get_env(),
             },
             policy=local_policy,
             options=options,
@@ -657,6 +667,7 @@ def create_run_policy(
         )
 
     def _start_steps(self) -> None:
+        """Start all new steps created since the last update."""
         self._heartbeat()
         with self._queue_lock:
@@ -821,6 +832,9 @@ def _refresh_statuses(self) -> None:
                     group_info.redir_workers = None
 
     def _update_shutdown_status(self) -> None:
+        """Query the status of running tasks and update the status
+        of any that have completed.
+        """
         self._heartbeat()
         with self._queue_lock:
             self._can_shutdown |= (
@@ -834,6 +848,9 @@ def _update_shutdown_status(self) -> None:
         )
 
     def _should_print_status(self) -> bool:
+        """Determine if status messages should be printed based on the last
+        update. Returns `True` to trigger prints, `False` otherwise.
+        """
         if self.current_time - self._last_update_time > 10:
             self._last_update_time = self.current_time
             return True
diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py
index 9cbc55674..98670f347 100644
--- a/smartsim/_core/launcher/dragon/dragonConnector.py
+++ b/smartsim/_core/launcher/dragon/dragonConnector.py
@@ -245,6 +245,9 @@ def load_persisted_env(self) -> t.Dict[str, str]:
 
         with open(config.dragon_dotenv, encoding="utf-8") as dot_env:
             for kvp in dot_env.readlines():
+                if not kvp:
+                    continue
+
                 # skip any commented lines
                 if not kvp.startswith("#"):
                     split = kvp.strip().split("=", maxsplit=1)
diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py
index 4ccf7cf7f..110f19258 100644
--- a/smartsim/_core/mli/comm/channel/dragon_channel.py
+++ b/smartsim/_core/mli/comm/channel/dragon_channel.py
@@ -35,14 +35,6 @@
 
 logger = get_logger(__name__)
 
-DEFAULT_CHANNEL_BUFFER_SIZE = 500
-"""Maximum number of messages that can be buffered. DragonCommChannel will
-raise an exception if no clients consume messages before the buffer is filled."""
-
-LAST_OFFSET = 0
-"""The last offset used to create a local channel. This is used to avoid
-unnecessary retries when creating a local channel."""
-
 
 class DragonCommChannel(cch.CommChannelBase):
     """Passes messages by writing to a Dragon channel."""
diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
index 254a21c5b..d7787f2ca 100644
--- a/smartsim/_core/mli/comm/channel/dragon_fli.py
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -51,7 +51,7 @@ def __init__(
     ) -> None:
         """Initialize the DragonFLIChannel instance.
-        :param fli_desc: The descriptor of the FLI channel to attach
+        :param fli_: The FLInterface to use as the underlying communications channel
         :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams
         :param buffer_size: Maximum number of sent messages that can be buffered
         """
@@ -79,7 +79,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None:
                 logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
         except Exception as e:
             raise SmartSimError(
-                f"Error sending message: DragonFLIChannel {self.descriptor}"
+                f"Error sending via DragonFLIChannel {self.descriptor}"
             ) from e

     def recv(self, timeout: float = 0.001) -> t.List[bytes]:
@@ -99,6 +99,7 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
                     logger.debug(f"DragonFLIChannel {self.descriptor} received message")
             except fli.FLIEOT:
                 eot = True
+                logger.debug(f"DragonFLIChannel exhausted: {self.descriptor}")
             except Exception as e:
                 raise SmartSimError(
                     f"Error receiving messages: DragonFLIChannel {self.descriptor}"
@@ -134,7 +135,8 @@ def from_descriptor(

         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFLIChannel
-        :raises SmartSimError: If creation of DragonFLIChanenel fails
+        :raises SmartSimError: If creation of DragonFLIChannel fails
+        :raises ValueError: If the descriptor is invalid
         """
         if not descriptor:
             raise ValueError("Invalid descriptor provided")
diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
index 258d84b3a..8517979ec 100644
--- a/smartsim/_core/mli/comm/channel/dragon_util.py
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -30,10 +30,7 @@

 import dragon.channels as dch
 import dragon.fli as fli
-import dragon.infrastructure.facts as df
-import dragon.infrastructure.parameters as dp
 import dragon.managed_memory as dm
-import dragon.utils as du

 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
@@ -54,10 +51,10 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str

     :param channel: The dragon channel to convert
     :returns: The descriptor string
-    :raises SmartSimError: If a dragon channel is not provided
+    :raises ValueError: If a dragon channel is not provided
     """
     if channel is None:
-        raise SmartSimError("Channel is not available to create a descriptor")
+        raise ValueError("Channel is not available to create a descriptor")

     serialized_ch = channel.serialize()
     return base64.b64encode(serialized_ch).decode("utf-8")
@@ -67,9 +64,11 @@ def pool_to_descriptor(pool: dm.MemoryPool) -> str:
     """Convert a dragon memory pool to a descriptor string.
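As a usage sketch for these helpers: a channel can be round-tripped through its string descriptor so a second process can attach to it. This assumes a running Dragon runtime; `create_local` is defined later in this file:

```python
# Round-trip a local Dragon channel through its base64 string descriptor.
from smartsim._core.mli.comm.channel.dragon_util import (
    channel_to_descriptor,
    create_local,
    descriptor_to_channel,
)

channel = create_local()  # backed by dch.Channel.make_process_local()
descriptor = channel_to_descriptor(channel)  # base64-encoded serialized channel

# any process holding the descriptor string can attach to the same channel
attached = descriptor_to_channel(descriptor)
```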
:param pool: The memory pool to convert - :returns: The descriptor string""" + :returns: The descriptor string + :raises ValueError: If a memory pool is not provided + """ if pool is None: - raise SmartSimError("Memory pool is not available to create a descriptor") + raise ValueError("Memory pool is not available to create a descriptor") serialized_pool = pool.serialize() return base64.b64encode(serialized_pool).decode("utf-8") @@ -82,6 +81,7 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": :param descriptor: The descriptor of an FLI to attach to :returns: The attached dragon FLI :raises ValueError: If the descriptor is empty or incorrectly formatted + :raises SmartSimError: If attachment using the descriptor fails """ if len(descriptor) < 1: raise ValueError("Descriptors may not be empty") @@ -103,7 +103,8 @@ def descriptor_to_channel(descriptor: str) -> dch.Channel: :param descriptor: The descriptor of a channel to attach to :returns: The attached dragon Channel :raises ValueError: If the descriptor is empty or incorrectly formatted - :raises SmartSimError: If the descriptor does not attach to a channel""" + :raises SmartSimError: If attachment using the descriptor fails + """ if len(descriptor) < 1: raise ValueError("Descriptors may not be empty") @@ -122,43 +123,9 @@ def create_local(_capacity: int = 0) -> dch.Channel: direct calls to `dch.Channel.make_process_local()` to enable supplying a channel capacity. - :param capacity: The number of events the channel can buffer; uses the default + :param _capacity: The number of events the channel can buffer; uses the default buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied :returns: The instantiated channel - :raises SmartSimError: If unable to attach local channel """ - # current implementation has a bug wrt MPI that must be fixed. - # falling back to `make_process_local` and disabling buffer size tests - - # pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) - # pool_descriptor = pool_to_descriptor(pool) - # channel: t.Optional[dch.Channel] = None - # offset = 0 - - # global LAST_OFFSET - # if LAST_OFFSET: - # offset = LAST_OFFSET - - # capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE - - # while not channel: - # # search for an open channel ID - # offset += 1 - # channel_id = df.BASE_USER_MANAGED_CUID + offset - # try: - # channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity) - # LAST_OFFSET = offset - # descriptor = channel_to_descriptor(channel) - # logger.debug( - # "Local channel created: " - # f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}" - # ) - # except dch.ChannelError as e: - # if offset < 100: - # logger.warning(f"Channnel id `{channel_id}` is not open. Retrying...") - # else: - # LAST_OFFSET = 0 - # logger.error(f"All attempts to attach local channel have failed") - # raise SmartSimError("Failed to attach local channel") from e channel = dch.Channel.make_process_local() return channel diff --git a/smartsim/_core/mli/infrastructure/comm/__init__.py b/smartsim/_core/mli/infrastructure/comm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py new file mode 100644 index 000000000..d813cce12 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py @@ -0,0 +1,238 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+import uuid
+from collections import defaultdict, deque
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.infrastructure.comm.event import EventBase
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class BroadcastResult(t.NamedTuple):
+    """Contains summary details about a broadcast."""
+
+    num_sent: int
+    """The total number of messages delivered across all consumers"""
+    num_failed: int
+    """The total number of messages not delivered across all consumers"""
+
+
+class EventBroadcaster:
+    """Performs fan-out publishing of system events."""
+
+    def __init__(
+        self,
+        backbone: BackboneFeatureStore,
+        channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None,
+        name: t.Optional[str] = None,
+    ) -> None:
+        """Initialize the EventBroadcaster instance.
+
+        :param backbone: The MLI backbone feature store
+        :param channel_factory: Factory method to construct new channel instances
+        :param name: A user-friendly name for logging; auto-generated if not provided
+        """
+        self._backbone = backbone
+        """The backbone feature store used to retrieve consumer descriptors"""
+        self._channel_factory = channel_factory
+        """A factory method used to instantiate channels from descriptors"""
+        self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict(
+            lambda: None
+        )
+        """A mapping of instantiated channels that can be re-used. Automatically
+        calls the channel factory if a descriptor is not already in the collection"""
+        self._event_buffer: t.Deque[EventBase] = deque()
+        """A buffer for storing events when a consumer list is not found"""
+        self._descriptors: t.Set[str]
+        """Stores the most recent list of broadcast consumers. Updated automatically
+        on each broadcast"""
+        self._name = name or str(uuid.uuid4())
+        """A unique identifier assigned to the broadcaster for logging"""
+
+    @property
+    def name(self) -> str:
+        """The friendly name assigned to the broadcaster.
+
+        :returns: The broadcaster name if one is assigned, otherwise a unique
+        id assigned by the system.
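To see the broadcaster in use: it is typically paired with a channel factory so cached channels can be rebuilt from descriptors stored in the backbone. A minimal sketch, assuming `backbone` is an already-attached `BackboneFeatureStore` and using the `DragonCommChannel.from_descriptor` factory seen elsewhere in this patch:

```python
# Publish a feature-store write notification to every registered consumer.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore

broadcaster = EventBroadcaster(
    backbone,  # assumption: an attached BackboneFeatureStore
    channel_factory=DragonCommChannel.from_descriptor,
    name="example-publisher",
)

event = OnWriteFeatureStore("example-publisher", backbone.descriptor, "model-key")
num_sent = broadcaster.send(event, timeout=0.1)  # buffered if no consumers yet
```

Because unsent events stay in `_event_buffer`, a send that finds no registered consumers is not an error; the events go out on a later call once descriptors appear in the backbone.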
+ """ + return self._name + + @property + def num_buffered(self) -> int: + """Return the number of events currently buffered to send. + + :returns: Number of buffered events + """ + return len(self._event_buffer) + + def _save_to_buffer(self, event: EventBase) -> None: + """Places the event in the buffer to be sent once a consumer + list is available. + + :param event: The event to buffer + :raises ValueError: If the event cannot be buffered + """ + try: + self._event_buffer.append(event) + logger.debug(f"Buffered event {event=}") + except Exception as ex: + raise ValueError( + f"Unable to buffer event {event} in broadcaster {self.name}" + ) from ex + + def _log_broadcast_start(self) -> None: + """Logs broadcast statistics.""" + num_events = len(self._event_buffer) + num_copies = len(self._descriptors) + logger.debug( + f"Broadcast {num_events} events to {num_copies} consumers from {self.name}" + ) + + def _prune_unused_consumers(self) -> None: + """Performs maintenance on the channel cache by pruning any channel + that has been removed from the consumers list.""" + active_consumers = set(self._descriptors) + current_channels = set(self._channel_cache.keys()) + + # find any cached channels that are now unused + inactive_channels = current_channels.difference(active_consumers) + new_channels = active_consumers.difference(current_channels) + + for descriptor in inactive_channels: + self._channel_cache.pop(descriptor) + + logger.debug( + f"Pruning {len(inactive_channels)} stale consumers and" + f" found {len(new_channels)} new channels for {self.name}" + ) + + def _get_comm_channel(self, descriptor: str) -> CommChannelBase: + """Helper method to build and cache a comm channel. + + :param descriptor: The descriptor to pass to the channel factory + :returns: The instantiated channel + :raises SmartSimError: If the channel fails to attach + """ + comm_channel = self._channel_cache[descriptor] + if comm_channel is not None: + return comm_channel + + if self._channel_factory is None: + raise SmartSimError("No channel factory provided for consumers") + + try: + channel = self._channel_factory(descriptor) + self._channel_cache[descriptor] = channel + return channel + except Exception as ex: + msg = f"Unable to construct channel with descriptor: {descriptor}" + logger.error(msg, exc_info=True) + raise SmartSimError(msg) from ex + + def _get_next_event(self) -> t.Optional[EventBase]: + """Pop the next event to be sent from the queue. + + :returns: The next event to send if any events are enqueued, otherwise `None`. + """ + try: + return self._event_buffer.popleft() + except IndexError: + logger.debug(f"Broadcast buffer exhausted for {self.name}") + + return None + + def _broadcast(self, timeout: float = 0.001) -> BroadcastResult: + """Broadcasts all buffered events to registered event consumers. 
+
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :returns: BroadcastResult containing the number of messages that were
+        successfully and unsuccessfully sent for all consumers
+        :raises SmartSimError: If a channel fails to attach
+        """
+        # allow descriptors to be empty since events are buffered
+        self._descriptors = set(x for x in self._backbone.notification_channels if x)
+        if not self._descriptors:
+            msg = f"No event consumers are registered for {self.name}"
+            logger.warning(msg)
+            return BroadcastResult(0, 0)
+
+        self._prune_unused_consumers()
+        self._log_broadcast_start()
+
+        num_listeners = len(self._descriptors)
+        num_sent = 0
+        num_failures = 0
+
+        # send each event to every consumer
+        while event := self._get_next_event():
+            logger.debug(f"Broadcasting {event=} to {num_listeners} listeners")
+            event_bytes = bytes(event)
+
+            for i, descriptor in enumerate(self._descriptors):
+                comm_channel = self._get_comm_channel(descriptor)
+
+                try:
+                    comm_channel.send(event_bytes, timeout)
+                    num_sent += 1
+                except Exception:
+                    msg = (
+                        f"Broadcast {i+1}/{num_listeners} for event {event.uid} to "
+                        f"channel {descriptor} from {self.name} failed."
+                    )
+                    logger.exception(msg)
+                    num_failures += 1
+
+        return BroadcastResult(num_sent, num_failures)
+
+    def send(self, event: EventBase, timeout: float = 0.001) -> int:
+        """Implementation of the `send` method of the `EventPublisher` protocol.
+        Publishes the supplied event to all registered broadcast consumers.
+
+        :param event: An event to publish
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :returns: The total number of events successfully published to consumers
+        :raises ValueError: If event serialization fails
+        :raises AttributeError: If event cannot be serialized
+        :raises KeyError: If a channel fails to attach using registered descriptors
+        :raises SmartSimError: If any unexpected error occurs during send
+        """
+        try:
+            self._save_to_buffer(event)
+            result = self._broadcast(timeout)
+            return result.num_sent
+        except (KeyError, ValueError, AttributeError, SmartSimError):
+            raise
+        except Exception as ex:
+            raise SmartSimError("An unexpected failure occurred while sending") from ex
diff --git a/smartsim/_core/mli/infrastructure/comm/consumer.py b/smartsim/_core/mli/infrastructure/comm/consumer.py
new file mode 100644
index 000000000..3e03ba86c
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/consumer.py
@@ -0,0 +1,283 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pickle
+import time
+import typing as t
+import uuid
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.comm.event import (
+    EventBase,
+    OnCreateConsumer,
+    OnRemoveConsumer,
+    OnShutdownRequested,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class EventConsumer:
+    """Reads system events published to a communications channel."""
+
+    _BACKBONE_WAIT_TIMEOUT = 10.0
+    """Maximum time (in seconds) to wait for the backbone to register the consumer"""
+
+    def __init__(
+        self,
+        comm_channel: CommChannelBase,
+        backbone: BackboneFeatureStore,
+        filters: t.Optional[t.List[str]] = None,
+        name: t.Optional[str] = None,
+        event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
+    ) -> None:
+        """Initialize the EventConsumer instance.
+
+        :param comm_channel: Communications channel to listen to for events
+        :param backbone: The MLI backbone feature store
+        :param filters: A list of event types to deliver. When empty, all
+        events will be delivered
+        :param name: A user-friendly name for logging. If not provided, an
+        auto-generated GUID will be used
+        :param event_handler: An optional callback invoked for each received
+        event that passes the filters
+        """
+        self._comm_channel = comm_channel
+        """The comm channel used by the consumer to receive messages. The channel
+        descriptor will be published for senders to discover."""
+        self._backbone = backbone
+        """The backbone instance used to bootstrap the instance. The EventConsumer
+        uses the backbone to discover where it can publish its descriptor."""
+        self._global_filters = filters or []
+        """A set of global filters to apply to incoming events. Global filters are
+        combined with per-call filters. Filters act as an allow-list."""
+        self._name = name or str(uuid.uuid4())
+        """User-friendly name assigned to a consumer for logging. Automatically
+        assigned if not provided."""
+        self._event_handler = event_handler
+        """The function that should be executed when an event
+        passed by the filters is received."""
+        self.listening = True
+        """Flag indicating that the consumer is currently listening for new
+        events. Setting this flag to `False` will cause any active calls to
+        `listen` to terminate."""
+
+    @property
+    def descriptor(self) -> str:
+        """The descriptor of the underlying comm channel.
+
+        :returns: The comm channel descriptor"""
+        return self._comm_channel.descriptor
+
+    @property
+    def name(self) -> str:
+        """The friendly name assigned to the consumer.
+
+        :returns: The consumer name if one is assigned, otherwise a unique
+        id assigned by the system.
+        """
+        return self._name
+
+    def recv(
+        self,
+        filters: t.Optional[t.List[str]] = None,
+        timeout: float = 0.001,
+        batch_timeout: float = 1.0,
+    ) -> t.List[EventBase]:
+        """Receives available published event(s).
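A minimal sketch of the consumer workflow defined here: create a local channel, register with the registrar listener so broadcasts reach it, then poll for filtered events. It assumes a running Dragon runtime, an attached `backbone`, and that `DragonCommChannel` can be constructed around a raw local channel (only its `from_descriptor` factory is shown in this patch):

```python
# Receive feature-store write notifications on a dedicated channel.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragon_util import create_local
from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore

consumer = EventConsumer(
    DragonCommChannel(create_local()),  # assumption: ctor wraps a raw channel
    backbone,                           # assumption: attached BackboneFeatureStore
    filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
    name="example-consumer",
)

consumer.register()  # notify the registrar so broadcasts reach this channel
for event in consumer.recv(timeout=0.01, batch_timeout=1.0):
    # the filter guarantees only OnWriteFeatureStore events arrive here
    print(f"{event.source} wrote {event.key}")
consumer.unregister()
```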
+
+        :param filters: Additional filters to add to the global filters configured
+        on the EventConsumer instance
+        :param timeout: Maximum time to wait for a single message to arrive
+        :param batch_timeout: Maximum time to wait for messages to arrive; allows
+        multiple batches to be retrieved in one call to `recv`
+        :returns: A list of events that pass any configured filters
+        :raises ValueError: If a non-positive value is provided for the
+        timeout or batch_timeout
+        """
+        if filters is None:
+            filters = []
+
+        if timeout is not None and timeout <= 0:
+            raise ValueError("request timeout must be a non-zero, positive value")
+
+        if batch_timeout is not None and batch_timeout <= 0:
+            raise ValueError("batch_timeout must be a non-zero, positive value")
+
+        filter_set = {*self._global_filters, *filters}
+        all_message_bytes: t.List[bytes] = []
+
+        # firehose as many messages as possible within the batch_timeout
+        start_at = time.time()
+        remaining = batch_timeout
+
+        batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+        while batch_message_bytes:
+            # remove any empty messages that will fail to decode
+            all_message_bytes.extend(batch_message_bytes)
+            batch_message_bytes = []
+
+            # avoid getting stuck indefinitely waiting for the channel
+            elapsed = time.time() - start_at
+            remaining = batch_timeout - elapsed
+
+            if remaining > 0:
+                batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+
+        events_received: t.List[EventBase] = []
+
+        # Timeout elapsed or no messages received - return the empty list
+        if not all_message_bytes:
+            return events_received
+
+        for message in all_message_bytes:
+            if not message:
+                continue
+
+            event = pickle.loads(message)
+            if not event:
+                logger.warning(f"Consumer {self.name} is unable to unpickle message")
+                continue
+
+            # skip events that don't pass a filter
+            if filter_set and event.category not in filter_set:
+                continue
+
+            events_received.append(event)
+
+        return events_received
+
+    def _send_to_registrar(self, event: EventBase) -> None:
+        """Send an event directly to the registrar listener.
+
+        :param event: The event to send
+        """
+        registrar_key = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER
+        config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT)
+        registrar_descriptor = str(config.get(registrar_key, None))
+
+        if not registrar_descriptor:
+            logger.warning(
+                f"Unable to send {event.category} from {self.name}. "
+                "No registrar channel found."
+            )
+            return
+
+        logger.debug(f"Sending {event.category} from {self.name}")
+
+        registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor)
+        registrar_channel.send(bytes(event), timeout=1.0)
+
+        logger.debug(f"{event.category} from {self.name} sent")
+
+    def register(self) -> None:
+        """Send an event to register this consumer as a listener."""
+        descriptor = self._comm_channel.descriptor
+        event = OnCreateConsumer(self.name, descriptor, self._global_filters)
+
+        self._send_to_registrar(event)
+
+    def unregister(self) -> None:
+        """Send an event to un-register this consumer as a listener."""
+        descriptor = self._comm_channel.descriptor
+        event = OnRemoveConsumer(self.name, descriptor)
+
+        self._send_to_registrar(event)
+
+    def _on_handler_missing(self, event: EventBase) -> None:
+        """A "dead letter" event handler that is called to perform
+        processing on events before they're discarded.
+
+        :param event: The event to handle
+        """
+        logger.warning(
+            "No event handler is registered in consumer "
+            f"{self.name}. Discarding {event=}"
+        )
+
+    def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer a single time. Delivers
+        all messages that pass the consumer filters. Shutdown requests
+        are handled by a default event handler.
+
+        NOTE: Executes a single batch-retrieval to receive the maximum
+        number of messages available under the batch timeout. To continually
+        listen, use `listen` in a non-blocking thread/process.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        logger.info(
+            f"Consumer {self.name} listening with {timeout} second timeout"
+            f" on channel {self._comm_channel.descriptor}"
+        )
+
+        if not self._event_handler:
+            logger.info("Unable to handle messages. No event handler is registered.")
+
+        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+        if not incoming_messages:
+            logger.info(f"Consumer {self.name} received empty message list")
+
+        for message in incoming_messages:
+            logger.info(f"Consumer {self.name} is handling event {message=}")
+            self._handle_shutdown(message)
+
+            if self._event_handler:
+                self._event_handler(message)
+            else:
+                self._on_handler_missing(message)
+
+    def _handle_shutdown(self, event: EventBase) -> bool:
+        """Handles shutdown requests sent to the consumer by setting the
+        `self.listening` attribute to `False`.
+
+        :param event: The event to handle
+        :returns: A bool indicating if the event was a shutdown request
+        """
+        if isinstance(event, OnShutdownRequested):
+            logger.debug(f"Shutdown requested from: {event.source}")
+            self.listening = False
+            return True
+        return False
+
+    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer until a shutdown request is received.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        logger.debug(f"Consumer {self.name} is now listening for events.")
+
+        while self.listening:
+            self.listen_once(timeout, batch_timeout)
+
+        logger.debug(f"Consumer {self.name} is no longer listening.")
diff --git a/smartsim/_core/mli/infrastructure/comm/event.py b/smartsim/_core/mli/infrastructure/comm/event.py
new file mode 100644
index 000000000..ccef9f9b8
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/event.py
@@ -0,0 +1,162 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pickle
+import typing as t
+import uuid
+from dataclasses import dataclass, field
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class EventBase:
+    """Core API for an event."""
+
+    category: str
+    """Unique category name for an event class"""
+    source: str
+    """A unique identifier for the publisher of the event"""
+    uid: str = field(default_factory=lambda: str(uuid.uuid4()))
+    """A unique identifier for this event"""
+
+    def __bytes__(self) -> bytes:
+        """Default conversion to bytes for an event required to publish
+        messages using byte-oriented communication channels.
+
+        :returns: This entity encoded as bytes"""
+        return pickle.dumps(self)
+
+    def __str__(self) -> str:
+        """Convert the event to a string.
+
+        :returns: A string representation of this instance"""
+        return f"{self.uid}|{self.category}"
+
+
+class OnShutdownRequested(EventBase):
+    """Publish this event to trigger the listener to shutdown."""
+
+    SHUTDOWN: t.ClassVar[str] = "consumer-unregister"
+    """Unique category name for an event raised to request a listener shutdown"""
+
+    def __init__(self, source: str) -> None:
+        """Initialize the event instance.
+
+        :param source: A unique identifier for the publisher of the event
+        """
+        super().__init__(self.SHUTDOWN, source)
+
+
+class OnCreateConsumer(EventBase):
+    """Publish this event when a new event consumer registration is required."""
+
+    descriptor: str
+    """Descriptor of the comm channel exposed by the consumer"""
+    filters: t.List[str] = field(default_factory=list)
+    """The collection of filters indicating messages of interest to this consumer"""
+
+    CONSUMER_CREATED: t.ClassVar[str] = "consumer-created"
+    """Unique category name for an event raised when a new consumer is registered"""
+
+    def __init__(self, source: str, descriptor: str, filters: t.Sequence[str]) -> None:
+        """Initialize the event instance.
+
+        :param source: A unique identifier for the publisher of the event
+        :param descriptor: Descriptor of the comm channel exposed by the consumer
+        :param filters: Collection of filters indicating messages of interest
+        """
+        super().__init__(self.CONSUMER_CREATED, source)
+        self.descriptor = descriptor
+        self.filters = list(filters)
+
+    def __str__(self) -> str:
+        """Convert the event to a string.
+
+        :returns: A string representation of this instance
+        """
+        _filters = ",".join(self.filters)
+        return f"{str(super())}|{self.descriptor}|{_filters}"
+
+
+class OnRemoveConsumer(EventBase):
+    """Publish this event when a consumer is shutting down and
+    should be removed from notification lists."""
+
+    descriptor: str
+    """Descriptor of the comm channel exposed by the consumer"""
+
+    CONSUMER_REMOVED: t.ClassVar[str] = "consumer-removed"
+    """Unique category name for an event raised when a consumer is unregistered"""
+
+    def __init__(self, source: str, descriptor: str) -> None:
+        """Initialize the OnRemoveConsumer event.
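Because `EventBase.__bytes__` pickles the whole object, the concrete event subclass survives transport and a consumer can recover it directly. A short round-trip sketch (the descriptor string is a hypothetical placeholder):

```python
# Events serialize via pickle, so the concrete subclass survives transport.
import pickle

from smartsim._core.mli.infrastructure.comm.event import OnCreateConsumer

event = OnCreateConsumer(
    "publisher-1",              # source
    "some-channel-descriptor",  # hypothetical channel descriptor
    filters=[OnCreateConsumer.CONSUMER_CREATED],
)

payload = bytes(event)            # what a broadcaster writes to a channel
restored = pickle.loads(payload)  # what a consumer reads back

assert isinstance(restored, OnCreateConsumer)
assert restored.descriptor == "some-channel-descriptor"
assert str(restored) == str(event)
```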
+ + :param source: A unique identifier for the publisher of the event + :param descriptor: Descriptor of the comm channel exposed by the consumer + """ + super().__init__(self.CONSUMER_REMOVED, source) + self.descriptor = descriptor + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}" + + +class OnWriteFeatureStore(EventBase): + """Publish this event when a feature store key is written.""" + + descriptor: str + """The descriptor of the feature store where the write occurred""" + key: str + """The key identifying where the write occurred""" + + FEATURE_STORE_WRITTEN: str = "feature-store-written" + """Event category for an event raised when a feature store key is written""" + + def __init__(self, source: str, descriptor: str, key: str) -> None: + """Initialize the OnWriteFeatureStore event. + + :param source: A unique identifier for the publisher of the event + :param descriptor: The descriptor of the feature store where the write occurred + :param key: The key identifying where the write occurred + """ + super().__init__(self.FEATURE_STORE_WRITTEN, source) + self.descriptor = descriptor + self.key = key + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}|{self.key}" diff --git a/smartsim/_core/mli/infrastructure/comm/producer.py b/smartsim/_core/mli/infrastructure/comm/producer.py new file mode 100644 index 000000000..2d8a7c14a --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/producer.py @@ -0,0 +1,44 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from smartsim._core.mli.infrastructure.comm.event import EventBase +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class EventProducer(t.Protocol): + """Core API of a class that publishes events.""" + + def send(self, event: EventBase, timeout: float = 0.001) -> int: + """Send an event using the configured comm channel. 
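Since `EventProducer` is a `typing.Protocol`, `EventBroadcaster` (and any test double) satisfies it structurally; no inheritance is required. A minimal sketch of a hypothetical no-op producer that type-checks against the protocol:

```python
# Structural typing: anything with a matching `send` is an EventProducer.
from smartsim._core.mli.infrastructure.comm.event import EventBase
from smartsim._core.mli.infrastructure.comm.producer import EventProducer


class NullProducer:
    """Hypothetical stand-in that drops events; handy in unit tests."""

    def send(self, event: EventBase, timeout: float = 0.001) -> int:
        return 0  # report nothing sent


producer: EventProducer = NullProducer()  # accepted by static type checkers
```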
+ + :param event: The event to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of messages that were sent + """ diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/listener.py similarity index 84% rename from smartsim/_core/mli/infrastructure/control/event_listener.py rename to smartsim/_core/mli/infrastructure/control/listener.py index 2485f77ea..b5c529615 100644 --- a/smartsim/_core/mli/infrastructure/control/event_listener.py +++ b/smartsim/_core/mli/infrastructure/control/listener.py @@ -27,11 +27,9 @@ # isort: off # pylint: disable=import-error # pylint: disable=unused-import +import socket import dragon -# from dragon.globalservices.api_setup import connect_to_infrastructure - - # pylint: enable=unused-import # pylint: enable=import-error # isort: on @@ -45,13 +43,15 @@ from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_util import create_local -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( EventBase, - EventCategory, - EventConsumer, OnCreateConsumer, OnRemoveConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, ) from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -60,8 +60,9 @@ class ConsumerRegistrationListener(Service): - """A long-running service that listens for events of a specific type - and executes the appropriate event handler.""" + """A long-running service that manages the list of consumers receiving + events that are broadcast. 
It hosts handlers for adding and removing consumers + """ def __init__( self, @@ -78,7 +79,6 @@ def __init__( :param timeout: Maximum time (in seconds) to allow a single recv request to wait :param batch_timeout: Maximum time (in seconds) to allow a batch of receives to continue to build - :param filters: Filters specifying the message types to handle :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met @@ -86,17 +86,13 @@ def __init__( super().__init__( as_service, cooldown, health_check_frequency=health_check_frequency ) - self._timeout = timeout """ Maximum time (in seconds) to allow a single recv request to wait""" - self._batch_timeout = batch_timeout """Maximum time (in seconds) to allow a batch of receives to continue to build""" - self._consumer: t.Optional[EventConsumer] = None """The event consumer that handles receiving events""" - self._backbone = backbone """A standalone, system-created feature store used to share internal information among MLI components""" @@ -112,8 +108,20 @@ def _on_shutdown(self) -> None: the main event loop during automatic shutdown.""" super()._on_shutdown() - # unregister this listener in the backbone - self._backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER) + if not self._consumer: + return + + # remove descriptor for this listener from the backbone if it's there + if registered_consumer := self._backbone.backend_channel: + # if there is a descriptor in the backbone and it's still this listener + if registered_consumer == self._consumer.descriptor: + logger.info( + f"Listener clearing backend consumer {self._consumer.name} " + "from backbone" + ) + + # unregister this listener in the backbone + self._backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) # TODO: need the channel to be cleaned up # self._consumer._comm_channel._channel.destroy() @@ -135,15 +143,18 @@ def _can_shutdown(self) -> bool: """ if self._backbone is None: - logger.info("Listener must shutdown: no backbone attached") + logger.info("Listener must shutdown. No backbone attached") return True if self._consumer is None: - logger.info("Listener must shutdown: no consumer channel created") + logger.info("Listener must shutdown. No consumer channel created") return True if not self._consumer.listening: - logger.info("Listener can shutdown: consumer is not listening") + logger.info( + f"Listener can shutdown. Consumer `{self._consumer.name}` " + "is not listening" + ) return True return False @@ -202,7 +213,7 @@ def _on_health_check(self) -> None: try: logger.debug("Retrieving registered listener descriptor") - descriptor = self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + descriptor = self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] except KeyError: descriptor = None if self._consumer: @@ -210,8 +221,8 @@ def _on_health_check(self) -> None: if self._consumer and descriptor != self._consumer.descriptor: logger.warning( - "This listener is no longer registered. It " - "will automatically shut down." + f"Consumer `{self._consumer.name}` for `ConsumerRegistrationListener` " + "is no longer registered. It will automatically shut down." 
) self._consumer.listening = False @@ -221,7 +232,8 @@ def _publish_consumer(self) -> None: logger.warning("No registrar consumer descriptor available to publisher") return - self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = ( + logger.debug(f"Publishing {self._consumer.descriptor} to backbone") + self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = ( self._consumer.descriptor ) @@ -235,6 +247,7 @@ def _create_eventing(self) -> EventConsumer: NOTE: the backbone must be initialized before connecting eventing clients. :returns: The newly created EventConsumer instance + :raises SmartSimError: If a listener channel cannot be created """ if self._consumer: @@ -253,8 +266,12 @@ def _create_eventing(self) -> EventConsumer: self._consumer = EventConsumer( event_channel, self._backbone, - [EventCategory.CONSUMER_CREATED, EventCategory.CONSUMER_REMOVED], - name="ConsumerRegistrar", + [ + OnCreateConsumer.CONSUMER_CREATED, + OnRemoveConsumer.CONSUMER_REMOVED, + OnShutdownRequested.SHUTDOWN, + ], + name=f"ConsumerRegistrar.{socket.gethostname()}", event_handler=self._on_event_received, ) self._publish_consumer() diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index d14755f53..b0f931cb3 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -146,7 +146,7 @@ def ready(self) -> bool: return False timed_out = False - if self._batch_timeout > 0: + if self._batch_timeout >= 0: timed_out = self._elapsed_time >= self._batch_timeout if self.full(): diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index 2c89184d8..5ba0fccc2 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -39,6 +39,11 @@ class EnvironmentConfigLoader: Facilitates the loading of a FeatureStore and Queue into the WorkerManager. """ + REQUEST_QUEUE_ENV_VAR = "_SMARTSIM_REQUEST_QUEUE" + """The environment variable that holds the request queue descriptor""" + BACKBONE_ENV_VAR = "_SMARTSIM_INFRA_BACKBONE" + """The environment variable that holds the backbone descriptor""" + def __init__( self, featurestore_factory: t.Callable[[str], FeatureStore], @@ -76,7 +81,7 @@ def get_backbone(self) -> t.Optional[FeatureStore]: :returns: The attached feature store via `_SMARTSIM_INFRA_BACKBONE` """ - descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") + descriptor = os.getenv(self.BACKBONE_ENV_VAR, "") if not descriptor: logger.warning("No backbone descriptor is configured") @@ -97,7 +102,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]: :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE` """ - descriptor = os.getenv("_SMARTSIM_REQUEST_QUEUE", "") + descriptor = os.getenv(self.REQUEST_QUEUE_ENV_VAR, "") if not descriptor: logger.warning("No queue descriptor is configured") diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 21fdecbed..b12d7b11b 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -24,15 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
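With the descriptors promoted to class-level constants on `EnvironmentConfigLoader`, call sites can reference `BACKBONE_ENV_VAR` and `REQUEST_QUEUE_ENV_VAR` instead of repeating the literals. A bootstrap sketch follows; only `featurestore_factory` is visible in this hunk, so the `callback_factory` and `queue_factory` parameters and the `from_descriptor` factories are assumptions based on the pattern used throughout this patch:

```python
# Attach to MLI infrastructure using the environment-provided descriptors.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
from smartsim._core.mli.infrastructure.environment_loader import (
    EnvironmentConfigLoader,
)
from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
    DragonFeatureStore,
)

loader = EnvironmentConfigLoader(
    featurestore_factory=DragonFeatureStore.from_descriptor,
    callback_factory=DragonCommChannel.from_descriptor,  # assumed parameter
    queue_factory=DragonFLIChannel.from_descriptor,      # assumed parameter
)

backbone = loader.get_backbone()  # reads EnvironmentConfigLoader.BACKBONE_ENV_VAR
queue = loader.get_queue()        # reads EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR
```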
-import enum
 import itertools
 import os
-import pickle
 import time
 import typing as t
-import uuid
-from collections import defaultdict, deque
-from dataclasses import dataclass

 # pylint: disable=import-error
 # isort: off
@@ -40,8 +35,6 @@

 # isort: on

-from smartsim._core.mli.comm.channel.channel import CommChannelBase
-from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
@@ -56,15 +49,23 @@ class BackboneFeatureStore(DragonFeatureStore):
     information stored in the MLI backbone feature store."""

     MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
-    MLI_BACKEND_CONSUMER = "_SMARTIM_MLI_BACKEND_CONSUMER"
+    """Unique key used in the backbone to locate the consumer list"""
+    MLI_REGISTRAR_CONSUMER = "_SMARTSIM_MLI_REGISTRAR_CONSUMER"
+    """Unique key used in the backbone to locate the registration consumer"""
     MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE"
+    """Unique key used in the backbone to locate the MLI work queue"""
     MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
+    """Unique key used in the backbone to locate the backbone feature store"""
     _CREATED_ON = "creation"
+    """Unique key used in the backbone to locate the creation date of the
+    feature store"""
     _DEFAULT_WAIT_TIMEOUT = 1.0
+    """The default wait time (in seconds) for blocking requests to
+    the feature store"""

     def __init__(
         self,
-        storage: "dragon_ddict.DDict",
+        storage: dragon_ddict.DDict,
         allow_reserved_writes: bool = False,
     ) -> None:
         """Initialize the DragonFeatureStore instance.
@@ -119,24 +120,23 @@ def notification_channels(self, values: t.Sequence[str]) -> None:

     @property
     def backend_channel(self) -> t.Optional[str]:
-        """Retrieve the channel descriptor exposed by the MLI backend for events.
+        """Retrieve the channel descriptor used to register event consumers.

         :returns: The channel descriptor"""
-        if self.MLI_BACKEND_CONSUMER in self:
-            return str(self[self.MLI_BACKEND_CONSUMER])
+        if self.MLI_REGISTRAR_CONSUMER in self:
+            return str(self[self.MLI_REGISTRAR_CONSUMER])
         return None

     @backend_channel.setter
     def backend_channel(self, value: str) -> None:
-        """Set the channel exposed by the MLI backend for events.
+        """Set the channel used to register event consumers.

         :param value: The stringified channel descriptor"""
-        self[self.MLI_BACKEND_CONSUMER] = value
+        self[self.MLI_REGISTRAR_CONSUMER] = value

     @property
     def worker_queue(self) -> t.Optional[str]:
-        """Retrieve the channel descriptor exposed by the MLI
-        backend to send work to an MLI worker manager instance.
+        """Retrieve the channel descriptor used to send work to MLI worker managers.

         :returns: The channel descriptor, if found. Otherwise, `None`"""
         if self.MLI_WORKER_QUEUE in self:
@@ -145,8 +145,7 @@ def worker_queue(self) -> t.Optional[str]:

     @worker_queue.setter
     def worker_queue(self, value: str) -> None:
-        """Set the channel descriptor exposed by the MLI
-        backend to send work to an MLI worker manager instance.
+        """Set the channel descriptor used to send work to MLI worker managers.
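The backbone keys above are exposed through properties, so components negotiate endpoints without hard-coding key strings. A sketch of the resulting handshake, assuming `backbone` is an attached `BackboneFeatureStore` and `queue_descriptor` was produced elsewhere:

```python
# Bootstrap via the backbone: one side publishes a descriptor, the other waits.
from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
)

# producer side (e.g. the worker manager entrypoint): publish the queue
backbone.worker_queue = queue_descriptor  # writes MLI_WORKER_QUEUE

# consumer side: block up to 10 seconds for the key to appear
config = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], timeout=10.0)
descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE])
```

`wait_for` raises `SmartSimError` if the timeout elapses first, so callers get a clear failure instead of polling loops scattered through the codebase.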
:param value: The channel descriptor""" self[self.MLI_WORKER_QUEUE] = value @@ -195,6 +194,8 @@ def _check_wait_timeout( :param start_time: the start time to use for elapsed calculation :param timeout: the timeout (in seconds) :param indicators: latest retrieval status for requested keys + :raises SmartSimError: If the timeout elapses before all values are + retrieved """ elapsed = time.time() - start_time if timeout and elapsed > timeout: @@ -211,6 +212,9 @@ def wait_for( :param keys: The required collection of keys to retrieve :param timeout: The maximum wait time in seconds + :returns: Dictionary containing the keys and values requested + :raises SmartSimError: If the timeout elapses without retrieving + all requested keys """ if timeout < 0: timeout = self._DEFAULT_WAIT_TIMEOUT @@ -253,579 +257,3 @@ def get_env(self) -> t.Dict[str, str]: :returns: The dictionary populated with env vars """ return {self.MLI_BACKBONE: self.descriptor} - - -class EventCategory(str, enum.Enum): - """Predefined event types raised by SmartSim backend.""" - - CONSUMER_CREATED: str = "consumer-created" - """Event category for an event raised when a new consumer is created""" - CONSUMER_REMOVED: str = "consumer-removed" - """Event category for an event raised when a new consumer is created""" - FEATURE_STORE_WRITTEN: str = "feature-store-written" - """Event category for an event raised when a feature store key is written""" - SHUTDOWN: str = "shutdown" - """Event category for an event that should trigger the listener to shutdown""" - - -@dataclass -class EventBase: - """Core API for an event.""" - - # todo: shift eventing code to: infrastructure / event / event.py - category: EventCategory - """The event category for this event; may be used for addressing, - prioritization, or filtering of events by a event publisher/consumer""" - - uid: str - """A unique identifier for this event""" - - def __bytes__(self) -> bytes: - """Default conversion to bytes for an event required to publish - messages using byte-oriented communication channels. - - :returns: This entity encoded as bytes""" - return pickle.dumps(self) - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance""" - return f"{self.uid}|{self.category}" - - -class OnShutdownRequested(EventBase): - """Publish this event to trigger the listener to shutdown.""" - - def __init__(self) -> None: - """Initialize the OnShutdownRequest event.""" - super().__init__(EventCategory.SHUTDOWN, str(uuid.uuid4())) - - -class OnCreateConsumer(EventBase): - """Publish this event when a new event consumer registration is required.""" - - descriptor: str - """Descriptor of the comm channel exposed by the consumer""" - filters: t.List[EventCategory] - """The collection of filters indicating messages of interest to this consumer""" - - def __init__(self, descriptor: str, filters: t.Sequence[EventCategory]) -> None: - """Initialize the OnCreateConsumer event. - - :param descriptor: Descriptor of the comm channel exposed by the consumer - :param descriptor: Collection of filters indicating messages of interest - """ - super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4())) - self.descriptor = descriptor - self.filters = list(filters) - - def __str__(self) -> str: - """Convert the event to a string. 
- - :returns: A string representation of this instance - """ - _filters = ",".join(self.filters) - return f"{str(super())}|{self.descriptor}|{_filters}" - - -class OnRemoveConsumer(EventBase): - """Publish this event when a consumer is shutting down and - should be removed from notification lists.""" - - descriptor: str - """Descriptor of the comm channel exposed by the consumer""" - - def __init__(self, descriptor: str) -> None: - """Initialize the OnRemoveConsumer event. - - :param descriptor: Descriptor of the comm channel exposed by the consumer - """ - super().__init__(EventCategory.CONSUMER_REMOVED, str(uuid.uuid4())) - self.descriptor = descriptor - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance - """ - return f"{str(super())}|{self.descriptor}" - - -class OnWriteFeatureStore(EventBase): - """Publish this event when a feature store key is written.""" - - descriptor: str - """The descriptor of the feature store where the write occurred""" - - key: str - """The key identifying where the write occurred""" - - def __init__(self, descriptor: str, key: str) -> None: - """Initialize the OnWriteFeatureStore event. - - :param descriptor: The descriptor of the feature store where the write occurred - :param key: The key identifying where the write occurred - """ - super().__init__(EventCategory.FEATURE_STORE_WRITTEN, str(uuid.uuid4())) - self.descriptor = descriptor - self.key = key - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance - """ - return f"{str(super())}|{self.descriptor}|{self.key}" - - -class EventProducer(t.Protocol): - """Core API of a class that publishes events.""" - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Send an event using the configured comm channel. - - :param event: The event to send - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of messages that were sent - """ - - -class EventSender: - """An event publisher that performs publishing of system events to a - single endpoint""" - - def __init__( - self, - backbone: BackboneFeatureStore, - channel: t.Optional[CommChannelBase], - ) -> None: - """Initialize the instance. - - :param backbone: The backbone feature store to use - :param channel: The comm channel to send events on - """ - self._backbone = backbone - self._channel: t.Optional[CommChannelBase] = channel - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Send an event using the configured comm channel. - - :param event: The event to send - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of message copies that were sent - :raises SmartSimError: If the comm channel is not configured - """ - if self._channel is None: - raise SmartSimError("No channel to send on") - num_sent = 0 - - logger.debug(f"Sending {event} to {self._channel.descriptor}") - - try: - event_bytes = bytes(event) - self._channel.send(event_bytes, timeout) - num_sent += 1 - except Exception as ex: - raise SmartSimError(f"Failed broadcast to channel: {self._channel}") from ex - - return num_sent - - -class EventBroadcaster: - """Performs fan-out publishing of system events.""" - - def __init__( - self, - backbone: BackboneFeatureStore, - channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, - ) -> None: - """Initialize the EventPublisher instance. 
- - :param backbone: The MLI backbone feature store - :param channel_factory: Factory method to construct new channel instances - """ - self._backbone = backbone - """The backbone feature store used to retrieve consumer descriptors""" - self._channel_factory = channel_factory - """A factory method used to instantiate channels from descriptors""" - self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict( - lambda: None - ) - """A mapping of instantiated channels that can be re-used. Automatically - calls the channel factory if a descriptor is not already in the collection""" - self._event_buffer: t.Deque[EventBase] = deque() - """A buffer for storing events when a consumer list is not found""" - self._descriptors: t.Set[str] - """Stores the most recent list of broadcast consumers. Updated automatically - on each broadcast""" - self._uid = str(uuid.uuid4()) - """A unique identifer assigned to the broadcaster for logging""" - - @property - def num_buffered(self) -> int: - """Return the number of events currently buffered to send. - - :returns: Number of buffered events - """ - return len(self._event_buffer) - - def _save_to_buffer(self, event: EventBase) -> None: - """Places the event in the buffer to be sent once a consumer - list is available. - - :param event: The event to buffer - :raises ValueError: If the event cannot be buffered - """ - try: - self._event_buffer.append(event) - logger.debug(f"Buffered event {event=}") - except Exception as ex: - raise ValueError(f"Unable to serialize event from {self._uid}") from ex - - def _log_broadcast_start(self) -> None: - """Logs broadcast statistics.""" - num_events = len(self._event_buffer) - num_copies = len(self._descriptors) - logger.debug( - f"Broadcast {num_events} events to {num_copies} consumers from {self._uid}" - ) - - def _prune_unused_consumers(self) -> None: - """Performs maintenance on the channel cache by pruning any channel - that has been removed from the consumers list.""" - active_consumers = set(self._descriptors) - current_channels = set(self._channel_cache.keys()) - - # find any cached channels that are now unused - inactive_channels = current_channels.difference(active_consumers) - new_channels = active_consumers.difference(current_channels) - - for descriptor in inactive_channels: - self._channel_cache.pop(descriptor) - - logger.debug( - f"Pruning {len(inactive_channels)} stale consumers and" - f" found {len(new_channels)} new channels for {self._uid}" - ) - - def _get_comm_channel(self, descriptor: str) -> CommChannelBase: - """Helper method to build and cache a comm channel. - - :param descriptor: The descriptor to pass to the channel factory - :returns: The instantiated channel - :raises SmartSimError: If the channel fails to attach - """ - comm_channel = self._channel_cache[descriptor] - if comm_channel is not None: - return comm_channel - - if self._channel_factory is None: - raise SmartSimError("No channel factory provided for consumers") - - try: - channel = self._channel_factory(descriptor) - self._channel_cache[descriptor] = channel - return channel - except Exception as ex: - msg = f"Unable to construct channel with descriptor: {descriptor}" - logger.error(msg, exc_info=True) - raise SmartSimError(msg) from ex - - def _get_next_event(self) -> t.Optional[EventBase]: - """Pop the next event to be sent from the queue. - - :returns: The next event to send if any events are enqueued, otherwise `None`. 
- """ - try: - return self._event_buffer.popleft() - except IndexError: - logger.debug(f"Broadcast buffer exhausted for {self._uid}") - - return None - - def _broadcast(self, timeout: float = 0.001) -> int: - """Broadcasts all buffered events to registered event consumers. - - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of events broadcasted to consumers - :raises SmartSimError: If the channel fails to attach - :raises SmartSimError: If broadcasting fails - """ - # allow descriptors to be empty since events are buffered - self._descriptors = set(x for x in self._backbone.notification_channels if x) - if not self._descriptors: - logger.warning(f"No event consumers are registered for {self._uid}") - return 0 - - self._prune_unused_consumers() - self._log_broadcast_start() - - num_sent = 0 - num_listeners = len(self._descriptors) - - # send each event to every consumer - while event := self._get_next_event(): - logger.debug(f"Broadcasting {event=} to {num_listeners} listeners") - event_bytes = bytes(event) - - for i, descriptor in enumerate(self._descriptors): - comm_channel = self._get_comm_channel(descriptor) - - try: - comm_channel.send(event_bytes, timeout) - num_sent += 1 - except Exception as ex: - raise SmartSimError( - f"Broadcast {i+1}/{num_listeners} for event {event.uid} to " - f"channel {descriptor} from {self._uid} failed." - ) from ex - - return num_sent - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Implementation of `send` method of the `EventPublisher` protocol. Publishes - the supplied event to all registered broadcast consumers. - - :param event: An event to publish - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of events successfully published - :raises ValueError: If event serialization fails - :raises AttributeError: If event cannot be serialized - :raises KeyError: If channel fails to attach using registered descriptors - :raises SmartSimError: If any unexpected error occurs during send - """ - try: - self._save_to_buffer(event) - return self._broadcast(timeout) - except (KeyError, ValueError, AttributeError, SmartSimError): - raise - except Exception as ex: - raise SmartSimError("An unexpected failure occurred while sending") from ex - - -class EventConsumer: - """Reads system events published to a communications channel.""" - - _BACKBONE_WAIT_TIMEOUT = 10.0 - """Maximum time (in seconds) to wait for the backbone to register the consumer""" - - def __init__( - self, - comm_channel: CommChannelBase, - # channel_factory: ..., - backbone: BackboneFeatureStore, - filters: t.Optional[t.List[EventCategory]] = None, - name: t.Optional[str] = None, - event_handler: t.Optional[t.Callable[[EventBase], None]] = None, - ) -> None: - """Initialize the EventConsumer instance. - - :param comm_channel: Communications channel to listen to for events - :param backbone: The MLI backbone feature store - :param filters: A list of event types to deliver. when empty, all - events will be delivered - :param name: A user-friendly name for logging. If not provided, an - auto-generated GUID will be used - :raises ValueError: If batch_timeout <= 0 - """ - self._comm_channel = comm_channel - """The comm channel used by the consumer to receive messages. The channel - descriptor will be published for senders to discover.""" - self._backbone = backbone - """The backbone instance used to bootstrap the instance. 
The EventConsumer
-        uses the backbone to discover where it can publish its descriptor."""
-        self._global_filters = filters or []
-        """A set of global filters to apply to incoming events. Global filters are
-        combined with per-call filters. Filters act as an allow-list."""
-        self._name = name
-        """User-friendly name assigned to a consumer for logging. Automatically
-        assigned if not provided."""
-        self._event_handler = event_handler
-        """The function that should be executed when an event
-        passed by the filters is received."""
-        self.listening = True
-        """Flag indicating that the consumer is currently listening for new
-        events. Setting this flag to `False` will cause any active calls to
-        `listen` to terminate."""
-
-    @property
-    def descriptor(self) -> str:
-        """The descriptor of the underlying comm channel.
-
-        :returns: The comm channel descriptor"""
-        return self._comm_channel.descriptor
-
-    @property
-    def name(self) -> str:
-        """The friendly name assigned to the consumer.
-
-        :returns: The consumer name if one is assigned, otherwise a unique
-        id assigned by the system.
-        """
-        if self._name is None:
-            self._name = str(uuid.uuid4())
-        return self._name
-
-    def recv(
-        self,
-        filters: t.Optional[t.List[EventCategory]] = None,
-        timeout: float = 0.001,
-        batch_timeout: float = 1.0,
-    ) -> t.List[EventBase]:
-        """Receives available published event(s).
-
-        :param filters: Additional filters to add to the global filters configured
-        on the EventConsumer instance
-        :param timeout: Maximum time to wait for a single message to arrive
-        :param batch_timeout: Maximum time to wait for messages to arrive; allows
-        multiple batches to be retrieved in one call to `recv`
-        :returns: A list of events that pass any configured filters
-        :raises ValueError: If a positive, non-zero value is not provided for the
-        timeout or batch_timeout.
- """ - if filters is None: - filters = [] - - if timeout is not None and timeout <= 0: - raise ValueError("request timeout must be a non-zero, positive value") - - if batch_timeout is not None and batch_timeout <= 0: - raise ValueError("batch_timeout must be a non-zero, positive value") - - filter_set = {*self._global_filters, *filters} - all_message_bytes: t.List[bytes] = [] - - # firehose as many messages as possible within the batch_timeout - start_at = time.time() - remaining = batch_timeout - - batch_message_bytes = self._comm_channel.recv(timeout=timeout) - while batch_message_bytes: - # remove any empty messages that will fail to decode - all_message_bytes.extend(batch_message_bytes) - batch_message_bytes = [] - - # avoid getting stuck indefinitely waiting for the channel - elapsed = time.time() - start_at - remaining = batch_timeout - elapsed - - if remaining > 0: - batch_message_bytes = self._comm_channel.recv(timeout=timeout) - - events_received: t.List[EventBase] = [] - - # Timeout elapsed or no messages received - return the empty list - if not all_message_bytes: - return events_received - - for message in all_message_bytes: - if not message or message is None: - continue - - event = pickle.loads(message) - if not event: - logger.warning("Unable to unpickle message") - - # skip events that don't pass a filter - if filter_set and event.category not in filter_set: - continue - - events_received.append(event) - - return events_received - - def _send_to_registrar(self, event: EventBase) -> None: - """Send an event direct to the registrar listener.""" - registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER - config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT) - registrar_descriptor = str(config.get(registrar_key, None)) - - if not registrar_descriptor: - logger.warning(f"Unable to {event.category}. No registrar channel found.") - return - - logger.debug(f"Sending {event.category} for {self.name}") - - registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) - registrar_channel.send(bytes(event), timeout=1.0) - - logger.debug(f"{event.category} for {self.name} sent") - - def register(self) -> None: - """Send an event to register this consumer as a listener.""" - descriptor = self._comm_channel.descriptor - event = OnCreateConsumer(descriptor, self._global_filters) - - self._send_to_registrar(event) - - def unregister(self) -> None: - """Send an event to un-register this consumer as a listener.""" - descriptor = self._comm_channel.descriptor - event = OnRemoveConsumer(descriptor) - - self._send_to_registrar(event) - - @staticmethod - def _on_handler_missing(event: EventBase) -> None: - """A "dead letter" event handler that is called to perform - processing on events before they're discarded. - - :param event: The event to handle - """ - logger.warning(f"No event handler is registered. Discarding {event=}") - - def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: - """Receives messages for the consumer a single time. Delivers - all messages that pass the consumer filters. Shutdown requests - are handled by a default event handler. - - - NOTE: Executes a single batch-retrieval to receive the maximum - number of messages available under batch timeout. 
To continually
-        listen, use `listen` in a non-blocking thread/process
-
-        :param timeout: Maximum time to wait (in seconds) for a message to arrive
-        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
-        """
-        logger.debug(f"Starting event listener with {timeout} second timeout")
-        logger.debug("Awaiting new messages")
-
-        if not self._event_handler:
-            logger.debug("Unable to handle messages. No event handler is registered.")
-
-        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)
-
-        if not incoming_messages:
-            logger.debug(f"Consumer {self.name} received empty message list.")
-
-        for message in incoming_messages:
-            logger.debug(f"Sending event {message=} to handler.")
-            self._handle_shutdown(message)
-
-            if self._event_handler:
-                self._event_handler(message)
-            else:
-                self._on_handler_missing(message)
-
-    def _handle_shutdown(self, event: EventBase) -> bool:
-        """Handles shutdown requests sent to the consumer by setting the
-        `self.listening` attribute to `False`.
-
-        :param event: The event to handle
-        :returns: A bool indicating if the event was a shutdown request
-        """
-        if isinstance(event, OnShutdownRequested):
-            self.listening = False
-            return True
-        return False
-
-    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
-        """Receives messages for the consumer until a shutdown request is received.
-
-        :param timeout: Maximum time to wait (in seconds) for a message to arrive
-        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
-        """
-
-        while self.listening:
-            self.listen_once(timeout, batch_timeout)
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index dc0f57ae6..24f2221c8 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -51,13 +51,19 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None:
 
         :param storage: A distributed dictionary to be used as the underlying storage
         mechanism of the feature store"""
+        if storage is None:
+            raise ValueError(
+                "Storage is required when instantiating a DragonFeatureStore."
+            )
+
+        descriptor = ""
         if isinstance(storage, dragon_ddict.DDict):
             descriptor = ddict_to_descriptor(storage)
-        else:
-            descriptor = "not-set"
 
         super().__init__(descriptor)
         self._storage: t.Dict[str, t.Union[str, bytes]] = storage
+        """The underlying storage mechanism of the DragonFeatureStore; a
+        distributed, in-memory key-value store"""
 
     def _get(self, key: str) -> t.Union[str, bytes]:
         """Retrieve a value from the underlying storage mechanism.
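With the guard above, a DragonFeatureStore can no longer be constructed without a
backing store: a missing store raises ValueError immediately, and the descriptor is
derived from the attached DDict. A minimal sketch of the resulting contract follows;
the create_ddict(1, 1, 32 * 1024 * 1024) sizing arguments are illustrative
assumptions, not values taken from this patch:

    from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
        DragonFeatureStore,
    )
    from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict

    # attach a feature store to a live dragon distributed dictionary;
    # the descriptor is computed from the ddict via ddict_to_descriptor
    storage = create_ddict(1, 1, 32 * 1024 * 1024)
    feature_store = DragonFeatureStore(storage)
    feature_store["my-model"] = b"serialized-model-bytes"

    # the new guard clause rejects a missing backing store up front
    try:
        DragonFeatureStore(None)
    except ValueError as ex:
        print(ex)  # "Storage is required when instantiating a DragonFeatureStore."

The test_dragon_feature_store_invalid_storage case added in
tests/dragon/test_error_handling.py below exercises exactly this guard.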
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py index fda89bba5..50d15664c 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_util.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py @@ -40,6 +40,7 @@ def ddict_to_descriptor(ddict: dragon_ddict.DDict) -> str: :param ddict: The dragon dictionary to convert :returns: The descriptor string + :raises ValueError: If a ddict is not provided """ if ddict is None: raise ValueError("DDict is not available to create a descriptor") diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index 260b1a337..ebca07ed4 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -43,7 +43,7 @@ class ReservedKeys(str, enum.Enum): """Storage location for the list of registered consumers that will receive events from an EventBroadcaster""" - MLI_BACKEND_CONSUMER = "_SMARTIM_MLI_BACKEND_CONSUMER" + MLI_REGISTRAR_CONSUMER = "_SMARTIM_MLI_REGISTRAR_CONSUMER" """Storage location for the channel used to send messages directly to the MLI backend""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index ac1a14866..018703271 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -111,7 +111,7 @@ def has_model_key(self) -> bool: @property def has_raw_inputs(self) -> bool: - """Check if the InferenceRequest contains raw_outputs. + """Check if the InferenceRequest contains raw_inputs. :returns: True if raw_outputs is not None and is not an empty list, False otherwise diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 114db88d9..f99950739 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -145,10 +145,12 @@ def max_length(self) -> int: return max(len(value) for value in self._timings.values()) def print_timings(self, to_file: bool = False) -> None: - """Print all timing information + """Print timing information to standard output. If `to_file` + is `True`, also write results to a file. 
- :param to_file: flag indicating if timing should be written to stdout - or to the timing file""" + :param to_file: If `True`, also saves timing information + to the files `timings.npy` and `timings.txt` + """ print(" ".join(self._timings.keys())) try: value_array = np.array(list(self._timings.values()), dtype=float) diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index d9cdcf594..46598a817 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -51,11 +51,10 @@ from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventProducer, - OnWriteFeatureStore, ) from smartsim._core.mli.message_handler import MessageHandler from smartsim._core.utils.timings import PerfTimer @@ -82,6 +81,10 @@ class ProtoClient: """A default number of events to be buffered in the work queue before triggering QueueFull exceptions.""" + _EVENT_SOURCE = "proto-client" + """A user-friendly name for this class instance to identify + the client as the publisher of an event.""" + @staticmethod def _attach_to_backbone() -> BackboneFeatureStore: """Use the supplied environment variables to attach @@ -90,6 +93,8 @@ def _attach_to_backbone() -> BackboneFeatureStore: environment variable. :returns: The attached backbone featurestore + :raises SmartSimError: If the backbone descriptor is not contained + in the appropriate environment variable """ descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) if descriptor is None or not descriptor: @@ -128,11 +133,11 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: return DragonFLIChannel.from_descriptor(descriptor) - def _create_broadcaster(self) -> EventProducer: - """Create an event publisher that will broadcast updates to - other MLI components. This publisher + def _create_broadcaster(self) -> EventBroadcaster: + """Create an EventBroadcaster that broadcasts events to + all MLI components registered to consume them. - :returns: the event publisher instance + :returns: An EventBroadcaster instance """ broadcaster = EventBroadcaster( self._backbone, DragonCommChannel.from_descriptor @@ -147,10 +152,11 @@ def __init__( """Initialize the client instance. :param timing_on: Flag indicating if timing information should be - written to file - :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the - worker queue + written to file + :param backbone_timeout: Maximum wait time (in seconds) allowed to attach to the + worker queue :raises SmartSimError: If unable to attach to a backbone featurestore + :raises ValueError: If an invalid backbone timeout is specified """ if MPI is not None: # TODO: determine a way to make MPI work in the test environment @@ -215,8 +221,8 @@ def _format_number(number: t.Union[numbers.Number, float]) -> str: def start_timings(self, batch_size: numbers.Number) -> None: """Configure the client to begin storing timing information. 
- :param bach_size: The size of batches to generate as inputs - to the model + :param batch_size: The size of batches to generate as inputs + to the model """ if self._timing_on: self._add_label_to_timings("batch_size") @@ -245,10 +251,11 @@ def measure_time(self, label: str) -> None: self._interm = time.perf_counter() def print_timings(self, to_file: bool = False) -> None: - """Print timing information to standard output. + """Print timing information to standard output. If `to_file` + is `True`, also write results to a file. :param to_file: If `True`, also saves timing information - to the files `timings.npy` and `timings.txt` + to the files `timings.npy` and `timings.txt` """ print(" ".join(self._timings.keys())) @@ -261,7 +268,7 @@ def print_timings(self, to_file: bool = False) -> None: numpy.savetxt("timings.txt", value_array) def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: - """Execute a bach of inference requests with the supplied ML model. + """Execute a batch of inference requests with the supplied ML model. :param model: The raw bytes or path to a pytorch model :param batch: The tensor batch to perform inference on @@ -305,7 +312,6 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: self.perf_timer.measure_time("send_request") for tensor in tensors: to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! - # to_sendh.send_bytes(bytes(tensor.data)) logger.info(f"Message size: {len(request_bytes)} bytes") self.perf_timer.measure_time("send_tensors") @@ -314,7 +320,7 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: self.perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) self.perf_timer.measure_time("deserialize_response") - # list of data blobs? + # recv depending on the len(response.result.descriptors)? 
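+            # a hedged sketch of what descriptor-count-driven receiving could
+            # look like; the one-blob-per-descriptor assumption is illustrative
+            # and not implemented by this patch:
+            #
+            #   data_blobs = [
+            #       from_recvh.recv_bytes(timeout=None)
+            #       for _ in response.result.descriptors
+            #   ]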
data_blob: bytes = from_recvh.recv_bytes(timeout=None) self.perf_timer.measure_time("receive_tensor") @@ -338,5 +344,5 @@ def set_model(self, key: str, model: bytes) -> None: self._backbone[key] = model # notify components of a change in the data at this key - event = OnWriteFeatureStore(self._backbone.descriptor, key) + event = OnWriteFeatureStore(self._EVENT_SOURCE, self._backbone.descriptor, key) self._publisher.send(event) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index 229855bc5..2b2ef50f9 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -35,13 +35,15 @@ from smartsim._core.launcher.dragon.dragonBackend import DragonBackend from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -from smartsim._core.mli.infrastructure.control.event_listener import ( +from smartsim._core.mli.infrastructure.comm.event import ( + OnCreateConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.control.listener import ( ConsumerRegistrationListener, ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - OnCreateConsumer, - OnShutdownRequested, ) from smartsim.log import get_logger @@ -50,25 +52,30 @@ logger = get_logger(__name__) -def test_dragonbackend_start_listener(): +@pytest.fixture(scope="module") +def the_backend() -> DragonBackend: + return DragonBackend(pid=9999) + + +def test_dragonbackend_start_listener(the_backend: DragonBackend): """Verify the background process listening to consumer registration events is up and processing messages as expected.""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() - backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER) + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor with pytest.raises(KeyError) as ex: # we expect the value of the consumer to be empty until # the listener start-up completes. 
- backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] assert "not found" in ex.value.args[0] - drg_process = backend.start_event_listener(cpu_affinity=[], gpu_affinity=[]) + drg_process = the_backend.start_event_listener(cpu_affinity=[], gpu_affinity=[]) # # confirm there is a process still running logger.info(f"Dragon process started: {drg_process}") @@ -79,24 +86,24 @@ def test_dragonbackend_start_listener(): # wait for the event listener to come up try: config = backbone.wait_for( - [BackboneFeatureStore.MLI_BACKEND_CONSUMER], timeout=30 + [BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], timeout=30 ) # verify result was in the returned configuration map - assert config[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + assert config[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] except Exception: raise KeyError( - f"Unable to locate {BackboneFeatureStore.MLI_BACKEND_CONSUMER}" + f"Unable to locate {BackboneFeatureStore.MLI_REGISTRAR_CONSUMER}" "in the backbone" ) # wait_for ensures the normal retrieval will now work, error-free - descriptor = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + descriptor = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] assert descriptor is not None # register a new listener channel comm_channel = DragonCommChannel.from_descriptor(descriptor) mock_descriptor = str(uuid.uuid4()) - event = OnCreateConsumer(mock_descriptor, []) + event = OnCreateConsumer("test_dragonbackend_start_listener", mock_descriptor, []) event_bytes = bytes(event) comm_channel.send(event_bytes) @@ -122,17 +129,19 @@ def test_dragonbackend_start_listener(): drg_process.join() -def test_dragonbackend_backend_consumer(): - """Verify the listener background process updates the MLI_BACKEND_CONSUMER +def test_dragonbackend_backend_consumer(the_backend: DragonBackend): + """Verify the listener background process updates the appropriate value in the backbone.""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + assert backbone._allow_reserved_writes # create listener with `as_service=False` to perform a single loop iteration - listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False) logger.debug(f"backbone loaded? {listener._backbone}") logger.debug(f"listener created? {listener}") @@ -142,8 +151,8 @@ def test_dragonbackend_backend_consumer(): # the entire service lifecycle listener.execute() - consumer_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] - logger.debug(f"MLI_BACKEND_CONSUMER: {consumer_desc}") + consumer_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + logger.debug(f"MLI_REGISTRAR_CONSUMER: {consumer_desc}") assert consumer_desc except Exception as ex: @@ -152,17 +161,17 @@ def test_dragonbackend_backend_consumer(): listener._on_shutdown() -def test_dragonbackend_event_handled(): - """Verify the event listener process updates the MLI_NOTIFY_CONSUMERS +def test_dragonbackend_event_handled(the_backend: DragonBackend): + """Verify the event listener process updates the appropriate value in the backbone when an event is received and again on shutdown. 
""" - backend = DragonBackend(pid=9999) - # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) # create the listener to be tested - listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False) assert listener._backbone, "The listener is not attached to a backbone" @@ -171,14 +180,18 @@ def test_dragonbackend_event_handled(): listener._create_eventing() # listener.execute() # grab the channel descriptor so we can simulate registrations - channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] comm_channel = DragonCommChannel.from_descriptor(channel_desc) num_events = 5 events = [] for i in range(num_events): # register some mock consumers using the backend channel - event = OnCreateConsumer(f"mock-consumer-descriptor-{uuid.uuid4()}", []) + event = OnCreateConsumer( + "test_dragonbackend_event_handled", + f"mock-consumer-descriptor-{uuid.uuid4()}", + [], + ) event_bytes = bytes(event) comm_channel.send(event_bytes) events.append(event) @@ -198,110 +211,97 @@ def test_dragonbackend_event_handled(): except Exception as ex: logger.exception(f"test_dragonbackend_event_handled - exception occurred: {ex}") + assert False finally: # shutdown should unregister a registration listener listener._on_shutdown() for i in range(10): - if BackboneFeatureStore.MLI_BACKEND_CONSUMER not in backbone: + if BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in backbone: logger.debug(f"The listener was removed after {i} iterations") channel_desc = None break # we should see that there is no listener registered - assert not channel_desc + assert not channel_desc, "Listener shutdown failed to clean up the backbone" -def test_dragonbackend_shutdown_event(): +def test_dragonbackend_shutdown_event(the_backend: DragonBackend): """Verify the background process shuts down when it receives a shutdown request.""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) - listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=True) - logger.debug(f"backbone loaded? {listener._backbone}") - logger.debug(f"listener created? 
{listener}") + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() - try: - # set up the listener but don't let the listener loop start - listener._create_eventing() # listener.execute() + # grab the channel descriptor so we can publish to it + channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + comm_channel = DragonCommChannel.from_descriptor(channel_desc) - # grab the channel descriptor so we can publish to it - channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] - comm_channel = DragonCommChannel.from_descriptor(channel_desc) + assert listener._consumer.listening, "Listener isn't ready to listen" - assert listener._consumer.listening, "Listener wasn't ready to listen" - - # send a shutdown request... - event = OnShutdownRequested() - event_bytes = bytes(event) - comm_channel.send(event_bytes) - - # run iteration a few times in case it takes a few cycles to write - for _ in range(5): - listener._on_iteration() - - logger.info(f"{listener._consumer.listening=}") + # send a shutdown request... + event = OnShutdownRequested("test_dragonbackend_shutdown_event") + event_bytes = bytes(event) + comm_channel.send(event_bytes, 0.1) - # ...and confirm the listener is now cancelled - assert not listener._consumer.listening + # execute should encounter the shutdown and exit + listener.execute() - except Exception as ex: - logger.exception( - f"test_dragonbackend_shutdown_event - exception occurred: {ex}" - ) + # ...and confirm the listener is now cancelled + assert not listener._consumer.listening @pytest.mark.parametrize("health_check_frequency", [10, 20]) -def test_dragonbackend_shutdown_on_health_check(health_check_frequency: float): +def test_dragonbackend_shutdown_on_health_check( + the_backend: DragonBackend, + health_check_frequency: float, +): """Verify that the event listener automatically shuts down when a new listener is registered in its place. 
:param health_check_frequency: The expected frequency of service health check invocations""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) listener = ConsumerRegistrationListener( backbone, 1.0, 1.0, - [], as_service=True, # allow service to run long enough to health check health_check_frequency=health_check_frequency, ) - try: - # set up the listener but don't let the listener loop start - listener._create_eventing() # listener.execute() - assert listener._consumer.listening, "Listener wasn't ready to listen" - - # Replace the consumer descriptor in the backbone to trigger - # an automatic shutdown - backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = str(uuid.uuid4()) + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() + assert listener._consumer.listening, "Listener wasn't ready to listen" - # set the last health check manually to verify the duration - start_at = time.time() - listener._last_health_check = time.time() + # Replace the consumer descriptor in the backbone to trigger + # an automatic shutdown + backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = str(uuid.uuid4()) - # run execute to let the service trigger health checks - listener.execute() - elapsed = time.time() - start_at + # set the last health check manually to verify the duration + start_at = time.time() + listener._last_health_check = time.time() - # confirm the frequency of the health check was honored - assert elapsed >= health_check_frequency + # run execute to let the service trigger health checks + listener.execute() + elapsed = time.time() - start_at - # ...and confirm the listener is now cancelled - assert ( - not listener._consumer.listening - ), "Listener was not automatically shutdown by the health check" + # confirm the frequency of the health check was honored + assert elapsed >= health_check_frequency - except Exception as ex: - logger.exception( - f"test_dragonbackend_shutdown_event - exception occurred: {ex}" - ) + # ...and confirm the listener is now cancelled + assert ( + not listener._consumer.listening + ), "Listener was not automatically shutdown by the health check" diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 9dd0255fe..e9f6004d1 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -36,7 +36,7 @@ from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( DragonFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict @@ -64,7 +64,8 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, + du.B64.bytes_to_str(queue.serialize()), ) config = EnvironmentConfigLoader( @@ -87,7 +88,8 @@ def 
test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     chan = create_local()
     queue = FLInterface(main_ch=chan)
     monkeypatch.setenv(
-        "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())
+        EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR,
+        du.B64.bytes_to_str(queue.serialize()),
     )
 
     config = EnvironmentConfigLoader(
@@ -102,7 +104,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
 
 def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
     """An incorrect serialized descriptor fails to attach."""
-    monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring")
+    monkeypatch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "randomstring")
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -120,7 +122,9 @@ def test_environment_loader_backbone_load_dfs(
     """Verify the dragon feature store is loaded correctly by the
     EnvironmentConfigLoader to demonstrate featurestore_factory correctness."""
     feature_store = DragonFeatureStore(the_storage)
-    monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor)
+    monkeypatch.setenv(
+        EnvironmentConfigLoader.BACKBONE_ENV_VAR, feature_store.descriptor
+    )
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -138,8 +142,8 @@ def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch):
     """EnvironmentConfigLoader getters return None when environment
     variables are not set."""
     with monkeypatch.context() as patch:
-        patch.setenv("_SMARTSIM_INFRA_BACKBONE", "")
-        patch.setenv("_SMARTSIM_REQUEST_QUEUE", "")
+        patch.setenv(EnvironmentConfigLoader.BACKBONE_ENV_VAR, "")
+        patch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "")
 
         config = EnvironmentConfigLoader(
             featurestore_factory=DragonFeatureStore.from_descriptor,
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index df370cbc4..a7ba7e7f2 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -523,3 +523,14 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch) -> None:
 
     mock_reply_fn.assert_called_once()
     mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.")
+
+
+def test_dragon_feature_store_invalid_storage():
+    """Verify that attempting to create a DragonFeatureStore without storage fails."""
+    storage = None
+
+    with pytest.raises(ValueError) as ex:
+        DragonFeatureStore(storage)
+
+    assert "storage" in ex.value.args[0].lower()
+    assert "required" in ex.value.args[0].lower()
diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py
index f361e6c16..bda8f33cd 100644
--- a/tests/dragon/test_event_consumer.py
+++ b/tests/dragon/test_event_consumer.py
@@ -35,22 +35,18 @@
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
 from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.comm.channel.dragon_util import create_local
-from smartsim._core.mli.infrastructure.control.event_listener import (
-    ConsumerRegistrationListener,
-)
-from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
-    BackboneFeatureStore,
-    EventBase,
-    EventBroadcaster,
-    EventCategory,
-    EventConsumer,
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
+from smartsim._core.mli.infrastructure.comm.event import (
     OnCreateConsumer,
-
OnRemoveConsumer, OnShutdownRequested, OnWriteFeatureStore, ) +from smartsim._core.mli.infrastructure.control.listener import ( + ConsumerRegistrationListener, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - time as bbtime, + BackboneFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.log import get_logger @@ -129,7 +125,7 @@ def test_eventconsumer_eventpublisher_integration( wmgr_consumer = EventConsumer( wmgr_channel, the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) capp_consumer = EventConsumer( capp_channel, @@ -138,7 +134,7 @@ def test_eventconsumer_eventpublisher_integration( back_consumer = EventConsumer( back_channel, the_backbone, - filters=[EventCategory.CONSUMER_CREATED], + filters=[OnCreateConsumer.CONSUMER_CREATED], ) # create some broadcasters to publish messages @@ -160,12 +156,20 @@ def test_eventconsumer_eventpublisher_integration( ] # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[]) + event_1 = OnCreateConsumer( + "test_eventconsumer_eventpublisher_integration", + wmgr_consumer_descriptor, + filters=[], + ) mock_worker_mgr.send(event_1) # simulate the app updating a model a few times for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", + the_backbone.descriptor, + key, + ) mock_client_app.send(event, timeout=0.1) # worker manager should only get updates about feature update @@ -209,7 +213,7 @@ def test_eventconsumer_invalid_timeout( wmgr_consumer = EventConsumer( wmgr_channel, the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) # the consumer should report an error for the invalid timeout value @@ -246,7 +250,11 @@ def test_eventconsumer_no_event_handler_registered( # simulate the app updating a model a few times for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_no_event_handler_registered", + the_backbone.descriptor, + key, + ) mock_worker_mgr.send(event, timeout=0.1) # run the handler and let it discard messages @@ -287,10 +295,16 @@ def test_eventconsumer_no_event_handler_registered_shutdown( # simulate the app updating a model a few times for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_no_event_handler_registered_shutdown", + the_backbone.descriptor, + key, + ) mock_worker_mgr.send(event, timeout=0.1) - event = OnShutdownRequested() + event = OnShutdownRequested( + "test_eventconsumer_no_event_handler_registered_shutdown" + ) mock_worker_mgr.send(event, timeout=0.1) # wmgr will stop listening to messages when it is told to stop listening @@ -389,19 +403,19 @@ def test_registrar_teardown( registrar._create_eventing() # confirm the registrar is published to the backbone - cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_BACKEND_CONSUMER], 10) - assert BackboneFeatureStore.MLI_BACKEND_CONSUMER in cfg + cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], 10) + assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in cfg # execute the entire service lifecycle 1x registrar.execute() - consumer_found = 
BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone + consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone for i in range(15): time.sleep(0.1) - consumer_found = BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone + consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone if not consumer_found: logger.debug(f"Registrar removed from the backbone on iteration {i}") break - assert BackboneFeatureStore.MLI_BACKEND_CONSUMER not in the_backbone + assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in the_backbone diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index e34120c98..9156979ed 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -36,16 +36,10 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, - OnCreateConsumer, - OnWriteFeatureStore, ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( time as bbtime, diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 2278a0036..4cadfd8f3 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -31,14 +31,15 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( OnCreateConsumer, OnWriteFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -55,9 +56,6 @@ # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -RANDOMLY_SET_KEY = "_SOMETHING_ELSE" - def boom(*args, **kwargs) -> None: """Helper function that blows up when used to mock up @@ -72,8 +70,8 @@ def test_event_uid() -> None: # generate a bunch of events and keep track all the IDs for i in range(num_iters): - event_a = OnCreateConsumer(str(i), filters=[]) - event_b = OnWriteFeatureStore(str(i), "key") + event_a = OnCreateConsumer("test_event_uid", str(i), filters=[]) + event_b = OnWriteFeatureStore("test_event_uid", str(i), filters=[]) uids.add(event_a.uid) uids.add(event_b.uid) @@ -186,7 +184,9 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # NOTE: we're not putting any consumers into the backbone here! 
backbone = BackboneFeatureStore(mock_storage) - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[] + ) publisher = EventBroadcaster(backbone) num_receivers = 0 @@ -194,7 +194,9 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # publishing this event without any known consumers registered should succeed # but report that it didn't have anybody to send the event to consumer_descriptor = storage_path / f"test-consumer" - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[] + ) num_receivers += publisher.send(event) @@ -225,7 +227,11 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [] - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_to_empty_consumer_list", + consumer_descriptor, + filters=[], + ) publisher = EventBroadcaster( backbone, channel_factory=FileSystemCommChannel.from_descriptor ) @@ -258,7 +264,11 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [consumer_descriptor] - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_without_channel_factory", + consumer_descriptor, + filters=[], + ) publisher = EventBroadcaster( backbone, # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied @@ -293,11 +303,17 @@ def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: # mock building up some buffered events num_buffered_events = 14 for i in range(num_buffered_events): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_empties_buffer", + storage_path / f"test-consumer-{str(i)}", + [], + ) publisher._event_buffer.append(bytes(event)) event0 = OnCreateConsumer( - storage_path / f"test-consumer-{str(num_buffered_events + 1)}", [] + "test_eventpublisher_broadcast_empties_buffer", + storage_path / f"test-consumer-{str(num_buffered_events + 1)}", + [], ) num_receivers = publisher.send(event0) @@ -344,13 +360,21 @@ def test_eventpublisher_broadcast_returns_total_sent( # mock building up some buffered events for i in range(num_buffered): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_returns_total_sent", + storage_path / f"test-consumer-{str(i)}", + [], + ) publisher._event_buffer.append(bytes(event)) assert publisher.num_buffered == num_buffered # this event will trigger clearing anything already in buffer - event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", []) + event0 = OnCreateConsumer( + "test_eventpublisher_broadcast_returns_total_sent", + storage_path / f"test-consumer-{num_buffered}", + [], + ) # num_receivers should contain a number that computes w/all consumers and all events num_receivers = publisher.send(event0) @@ -376,7 +400,11 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: backbone, channel_factory=FileSystemCommChannel.from_descriptor ) - event = 
OnCreateConsumer(consumer_descriptor, filters=[])
+    event = OnCreateConsumer(
+        "test_eventpublisher_prune_unused_consumer",
+        consumer_descriptor,
+        filters=[],
+    )
 
     # the only registered consumer is in the event, expect no pruning
     backbone.notification_channels = (consumer_descriptor,)
@@ -390,7 +418,9 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None:
 
     # ... and remove the old descriptor from the backbone when it's looked up
     backbone.notification_channels = (consumer_descriptor2,)
 
-    event = OnCreateConsumer(consumer_descriptor2, filters=[])
+    event = OnCreateConsumer(
+        "test_eventpublisher_prune_unused_consumer", consumer_descriptor2, filters=[]
+    )
 
     publisher.send(event)
@@ -447,7 +477,9 @@ def test_eventpublisher_serialize_failure(
     )
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventpublisher_serialize_failure", target_descriptor, filters=[]
+        )
 
         # patch the __bytes__ implementation to cause pickling to fail during send
         def bad_bytes(self) -> bytes:
@@ -490,7 +522,9 @@ def boom(descriptor: str) -> None:
     publisher = EventBroadcaster(backbone, channel_factory=boom)
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventpublisher_factory_failure", target_descriptor, filters=[]
+        )
 
         backbone.notification_channels = (target_descriptor,)
@@ -527,7 +561,9 @@ def boom(self) -> None:
         raise Exception("That was unexpected...")
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventpublisher_failure", target_descriptor, filters=[]
+        )
 
         # patch the _broadcast implementation to cause send to fail after
         # after the event has been pickled
@@ -559,7 +595,9 @@ def test_eventconsumer_receive(test_dir: str) -> None:
     backbone = BackboneFeatureStore(mock_storage)
     comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
 
-    event = OnCreateConsumer(target_descriptor, filters=[])
+    event = OnCreateConsumer(
+        "test_eventconsumer_receive", target_descriptor, filters=[]
+    )
 
     # simulate a sent event by writing directly to the input comm channel
     comm_channel.send(bytes(event))
@@ -596,7 +634,9 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
 
     # simulate multiple sent events by writing directly to the input comm channel
     for _ in range(num_sent):
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventconsumer_receive_multi", target_descriptor, filters=[]
+        )
         comm_channel.send(bytes(event))
 
     consumer = EventConsumer(comm_channel, backbone)
@@ -660,7 +700,7 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     wmgr_consumer = EventConsumer(
         wmgr_channel,
         backbone,
-        filters=[EventCategory.FEATURE_STORE_WRITTEN],
+        filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
    )
     capp_consumer = EventConsumer(
         capp_channel,
         backbone,
     )
     back_consumer = EventConsumer(
         back_channel,
         backbone,
-        filters=[EventCategory.CONSUMER_CREATED],
+        filters=[OnCreateConsumer.CONSUMER_CREATED],
     )
 
     # create some broadcasters to publish messages
@@ -691,13 +731,23 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     ]
 
     # simulate worker manager sending a notification to backend that it's alive
-    event_1 =
OnCreateConsumer( + "test_eventconsumer_eventpublisher_integration", + wmgr_consumer_descriptor, + filters=[], + ) mock_worker_mgr.send(event_1) # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") - event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") - event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + event_2 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1" + ) + event_3 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-2" + ) + event_4 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1" + ) mock_client_app.send(event_2) mock_client_app.send(event_3) @@ -741,7 +791,7 @@ def test_eventconsumer_batch_timeout( consumer = EventConsumer( channel, backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) consumer.recv(batch_timeout=invalid_timeout) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 895bc6467..69de23495 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -30,18 +30,16 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_channel import ( +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import ( DEFAULT_CHANNEL_BUFFER_SIZE, - DragonCommChannel, + create_local, ) -from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, - OnCreateConsumer, - OnWriteFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import ( create_ddict, @@ -131,7 +129,9 @@ def test_eventconsumer_max_dequeue( # simulate the app updating a model a lot of times for key in (f"key-{i}" for i in range(num_events)): - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_max_dequeue", the_backbone.descriptor, key + ) mock_client_app.send(event, timeout=0.01) num_dequeued = 0 @@ -223,7 +223,9 @@ def test_channel_buffer_size( # simulate the app updating a model a lot of times for key in (f"key-{i}" for i in range(buffer_size)): - event = OnWriteFeatureStore(backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_channel_buffer_size", backbone.descriptor, key + ) mock_client_app.send(event, timeout=0.01) # adding 1 more over the configured buffer size should report the error diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index b871de267..bc4a69612 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -37,10 +37,10 @@ from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.event 
import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - OnWriteFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError @@ -281,7 +281,7 @@ def test_protoclient_write_model_notification_sent( the_backbone[BackboneFeatureStore.MLI_BACKBONE] = the_backbone.descriptor the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_queue.descriptor the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners) - the_backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = None + the_backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = None with monkeypatch.context() as ctx: ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) @@ -323,6 +323,9 @@ def test_protoclient_write_model_notification_sent( ), "Expected default timeout on call to `publisher.send`, " # confirm the correct event was raised - event = t.cast(OnWriteFeatureStore, pickle.loads(event_bytes)) + event = t.cast( + OnWriteFeatureStore, + pickle.loads(event_bytes), + ) assert event.descriptor == the_backbone.descriptor assert event.key == model_key diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 819414eca..3372bc1ad 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -265,8 +265,10 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor - os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( + to_worker_fli_comm_channel.descriptor + ) + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index e54cdf7fd..c658f2f26 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -96,7 +96,8 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. 
- TODO: remove once unit tests are in place""" + :returns: Path to the model file + """ # test_path = pathlib.Path(work_dir) if not model_path.parent.exists(): model_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/mli/test_service.py b/tests/mli/test_service.py index 617738f94..3635f6ff7 100644 --- a/tests/mli/test_service.py +++ b/tests/mli/test_service.py @@ -27,6 +27,7 @@ import datetime import multiprocessing as mp import pathlib +import time import typing as t from asyncore import loop @@ -47,23 +48,37 @@ class SimpleService(Service): def __init__( self, log: t.List[str], - quit_after: int = 0, + quit_after: int = -1, as_service: bool = False, - cooldown: int = 0, - loop_delay: int = 0, + cooldown: float = 0, + loop_delay: float = 0, + hc_freq: float = -1, + run_for: float = 0, ) -> None: - super().__init__(as_service, cooldown, loop_delay) + super().__init__(as_service, cooldown, loop_delay, hc_freq) self._log = log self._quit_after = quit_after - self.num_iterations = 0 self.num_starts = 0 self.num_shutdowns = 0 + self.num_health_checks = 0 self.num_cooldowns = 0 - self.num_can_shutdown = 0 self.num_delays = 0 + self.num_iterations = 0 + self.num_can_shutdown = 0 + self.run_for = run_for + self.start_time = time.time() - def _on_iteration(self) -> None: - self.num_iterations += 1 + @property + def runtime(self) -> float: + return time.time() - self.start_time + + def _can_shutdown(self) -> bool: + self.num_can_shutdown += 1 + + if self._quit_after > -1 and self.num_iterations >= self._quit_after: + return True + if self.run_for > 0: + return self.runtime >= self.run_for def _on_start(self) -> None: self.num_starts += 1 @@ -71,16 +86,17 @@ def _on_start(self) -> None: def _on_shutdown(self) -> None: self.num_shutdowns += 1 + def _on_health_check(self) -> None: + self.num_health_checks += 1 + def _on_cooldown_elapsed(self) -> None: self.num_cooldowns += 1 def _on_delay(self) -> None: self.num_delays += 1 - def _can_shutdown(self) -> bool: - self.num_can_shutdown += 1 - if self._quit_after == 0: - return True + def _on_iteration(self) -> None: + self.num_iterations += 1 return self.num_iterations >= self._quit_after @@ -134,6 +150,7 @@ def test_service_run_until_can_shutdown(num_iterations: int) -> None: # no matter what, it should always execute the _on_iteration method assert service.num_iterations == 1 else: + # the shutdown check follows on_iteration. 
there will be one last call
         assert service.num_iterations == num_iterations
 
     assert service.num_starts == 1
@@ -203,3 +220,71 @@ def test_service_delay(delay: int, num_iterations: int) -> None:
     assert duration_in_seconds <= expected_duration
     assert service.num_cooldowns == 0
     assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+    "health_check_freq, run_for",
+    [
+        pytest.param(1, 5.5, id="1s freq, 10x"),
+        pytest.param(5, 10.5, id="5s freq, 2x"),
+        pytest.param(0.1, 5.1, id="0.1s freq, 50x"),
+    ],
+)
+def test_service_health_check_freq(health_check_freq: float, run_for: float) -> None:
+    """Verify that the health check frequency is honored
+
+    :param health_check_freq: The desired frequency of the health check
+    :param run_for: A fixed duration to allow the service to run
+    """
+    activity_log: t.List[str] = []
+
+    service = SimpleService(
+        activity_log,
+        quit_after=-1,
+        as_service=True,
+        cooldown=0,
+        hc_freq=health_check_freq,
+        run_for=run_for,
+    )
+
+    ts0 = datetime.datetime.now()
+    service.execute()
+    ts1 = datetime.datetime.now()
+
+    # the expected number of health checks for the elapsed runtime and frequency
+    expected_hc_count = run_for // health_check_freq
+
+    # allow some wiggle room for frequency comparison
+    assert expected_hc_count - 1 <= service.num_health_checks <= expected_hc_count + 1
+
+    assert service.num_cooldowns == 0
+    assert service.num_shutdowns == 1
+
+
+def test_service_health_check_freq_unbound() -> None:
+    """Verify that a health check frequency of zero is treated as
+    "always on" and is called each loop iteration
+
+    :param health_check_freq: The desired frequency of the health check
+    :param run_for: A fixed duration to allow the service to run
+    """
+    health_check_freq: float = 0.0
+    run_for: float = 5
+
+    activity_log: t.List[str] = []
+
+    service = SimpleService(
+        activity_log,
+        quit_after=-1,
+        as_service=True,
+        cooldown=0,
+        hc_freq=health_check_freq,
+        run_for=run_for,
+    )
+
+    service.execute()
+
+    # allow some wiggle room for frequency comparison
+    assert service.num_health_checks == service.num_iterations
+    assert service.num_cooldowns == 0
+    assert service.num_shutdowns == 1
diff --git a/tests/test_dragon_comm_utils.py b/tests/test_dragon_comm_utils.py
index 06d6e19b3..a6f9c206a 100644
--- a/tests/test_dragon_comm_utils.py
+++ b/tests/test_dragon_comm_utils.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import base64
 import pathlib
 import uuid
 
@@ -35,6 +36,8 @@
 
 # isort: off
 import dragon.channels as dch
+import dragon.infrastructure.parameters as dp
+import dragon.managed_memory as dm
 import dragon.fli as fli
 
 # isort: on
@@ -47,6 +50,16 @@
 logger = get_logger(__name__)
 
 
+@pytest.fixture(scope="function")
+def the_pool() -> dm.MemoryPool:
+    """Attaches to this process's default memory pool."""
+    raw_pool_descriptor = dp.this_process.default_pd
+    descriptor_ = base64.b64decode(raw_pool_descriptor)
+
+    pool = dm.MemoryPool.attach(descriptor_)
+    return pool
+
+
 @pytest.fixture(scope="function")
 def the_channel() -> dch.Channel:
     """Creates a Channel attached to the local memory pool."""
@@ -226,3 +239,19 @@ def test_descriptor_to_fli_happy_path(the_fli: dch.Channel) -> None:
 
     # and just make sure creation of the descriptor is transitive
     assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_pool_to_descriptor_empty() -> None:
+    """Verify that `pool_to_descriptor` raises an exception when
+    provided with a null pool."""
+
+    with pytest.raises(ValueError):
+        dragon_util.pool_to_descriptor(None)
+
+
+def test_pool_to_descriptor_happy_path(the_pool) -> None:
+    """Verify that `pool_to_descriptor` creates a descriptor
+    when supplied with a valid memory pool."""
+
+    descriptor = dragon_util.pool_to_descriptor(the_pool)
+    assert descriptor
From 27ab4f985a59868066d71e428e591ea344910f5a Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Thu, 3 Oct 2024 20:43:53 -0500
Subject: [PATCH 27/40] fix extra arg copypasta

---
 tests/dragon/test_featurestore_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 4cadfd8f3..440b1f543 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -71,7 +71,7 @@ def test_event_uid() -> None:
     # generate a bunch of events and keep track all the IDs
     for i in range(num_iters):
         event_a = OnCreateConsumer("test_event_uid", str(i), filters=[])
-        event_b = OnWriteFeatureStore("test_event_uid", str(i), filters=[])
+        event_b = OnWriteFeatureStore("test_event_uid", str(i))
         uids.add(event_a.uid)
         uids.add(event_b.uid)
From fac30bde4b2b890c5b6c3446e67f9e718467490c Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Fri, 4 Oct 2024 11:21:59 -0500
Subject: [PATCH 28/40] test bugs

---
 smartsim/_core/mli/infrastructure/worker/worker.py | 2 +-
 tests/dragon/test_featurestore_base.py | 2 +-
 tests/dragon/test_featurestore_integration.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py
index 018703271..f1718f053 100644
--- a/smartsim/_core/mli/infrastructure/worker/worker.py
+++ b/smartsim/_core/mli/infrastructure/worker/worker.py
@@ -489,7 +489,7 @@ def fetch_model(
             feature_store = feature_stores[fsd]
             raw_bytes: bytes = t.cast(bytes, feature_store[key])
             return FetchModelResult(raw_bytes)
-        except FileNotFoundError as ex:
+        except (FileNotFoundError, KeyError) as ex:
             logger.exception(ex)
             raise SmartSimError(f"Model could not be retrieved with key {key}") from ex
 
diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 440b1f543..6daceb906 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -71,7 +71,7 @@ def test_event_uid() -> None:
     # generate a bunch of events and keep track all the IDs
     for i in range(num_iters):
         event_a =
OnCreateConsumer("test_event_uid", str(i), filters=[]) - event_b = OnWriteFeatureStore("test_event_uid", str(i)) + event_b = OnWriteFeatureStore("test_event_uid", "test_event_uid", str(i)) uids.add(event_a.uid) uids.add(event_b.uid) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 69de23495..e9fa3d5dd 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -114,7 +114,7 @@ def test_eventconsumer_max_dequeue( wmgr_consumer = EventConsumer( the_worker_channel, the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) # create a broadcaster to publish messages From a1cf7ff61b36e12af28d4c28aae52576bd387481 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 7 Oct 2024 14:02:31 -0500 Subject: [PATCH 29/40] Remove from_sender_supplied_descriptor factory on FLI channel --- .../standalone_worker_manager.py | 2 +- smartsim/_core/mli/comm/channel/dragon_fli.py | 38 +++++-------------- tests/dragon/test_environment_loader.py | 2 +- tests/dragon/test_error_handling.py | 10 ++--- tests/dragon/test_event_consumer.py | 2 +- tests/dragon/test_featurestore.py | 2 +- tests/dragon/test_protoclient.py | 2 +- tests/dragon/test_request_dispatcher.py | 4 +- tests/dragon/test_worker_manager.py | 8 ++-- tests/dragon/utils/msg_pump.py | 4 +- 10 files changed, 26 insertions(+), 48 deletions(-) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index 9a3926803..b4527bc5d 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -141,7 +141,7 @@ def service_as_dragon_proc( to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, True) + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) backbone.worker_queue = to_worker_fli_comm_ch.descriptor diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index d7787f2ca..aa9be8897 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -46,8 +46,7 @@ class DragonFLIChannel(cch.CommChannelBase): def __init__( self, fli_: fli.FLInterface, - sender_supplied: bool = True, - buffer_size: int = 0, + buffer_size: int = drg_util.DEFAULT_CHANNEL_BUFFER_SIZE, ) -> None: """Initialize the DragonFLIChannel instance. @@ -60,11 +59,11 @@ def __init__( self._fli = fli_ """The underlying dragon FLInterface used by this CommChannel for communications""" - self._channel: t.Optional["dch.Channel"] = ( - drg_util.create_local(buffer_size) if sender_supplied else None - ) + self._channel: t.Optional["dch.Channel"] = None """The underlying dragon Channel used by a sender-side DragonFLIChannel to attach to the main FLI channel""" + self._buffer_size: int = buffer_size + """Maximum number of messages that can be buffered before sending""" def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. 
@@ -74,10 +73,14 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: :raises SmartSimError: If sending message fails """ try: + if self._channel is None: + self._channel = drg_util.create_local(self._buffer_size) + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: + self._channel = None raise SmartSimError( f"Error sending via DragonFLIChannel {self.descriptor}" ) from e @@ -106,26 +109,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: ) from e return messages - @classmethod - def from_sender_supplied_descriptor( - cls, - descriptor: str, - ) -> "DragonFLIChannel": - """A factory method that creates an instance from a descriptor string - - :param descriptor: the descriptor of the main FLI channel to attach - :returns: An attached DragonFLIChannel""" - try: - return DragonFLIChannel( - fli_=drg_util.descriptor_to_fli(descriptor), - sender_supplied=True, - ) - except: - logger.error( - f"Error while creating sender supplied DragonFLIChannel: {descriptor}" - ) - raise - @classmethod def from_descriptor( cls, @@ -142,10 +125,7 @@ def from_descriptor( raise ValueError("Invalid descriptor provided") try: - return DragonFLIChannel( - fli_=drg_util.descriptor_to_fli(descriptor), - sender_supplied=False, - ) + return DragonFLIChannel(fli_=drg_util.descriptor_to_fli(descriptor)) except Exception as e: raise SmartSimError( f"Error while creating DragonFLIChannel: {descriptor}" diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index e9f6004d1..aed1b0ae4 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -71,7 +71,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) config_queue = config.get_queue() diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index a7ba7e7f2..8421999a1 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -94,7 +94,7 @@ def the_worker_channel() -> DragonFLIChannel: that can be attached to.""" channel_ = create_local() fli_ = FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) return comm_channel @@ -132,7 +132,7 @@ def setup_worker_manager_model_bytes( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) @@ -190,7 +190,7 @@ def setup_worker_manager_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) @@ -246,7 +246,7 @@ def setup_request_dispatcher_model_bytes( config_loader = 
EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) request_dispatcher = RequestDispatcher( @@ -289,7 +289,7 @@ def setup_request_dispatcher_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) request_dispatcher = RequestDispatcher( diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py index bda8f33cd..8c752c372 100644 --- a/tests/dragon/test_event_consumer.py +++ b/tests/dragon/test_event_consumer.py @@ -79,7 +79,7 @@ def the_worker_channel() -> DragonFLIChannel: that can be attached to. Does not modify environment vars.""" channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) return comm_channel diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 9156979ed..a97accd64 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -75,7 +75,7 @@ def the_worker_channel() -> DragonFLIChannel: that can be attached to. Does not modify environment vars.""" channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) return comm_channel diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index bc4a69612..6885acc96 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -94,7 +94,7 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: # create the FLI to_worker_channel = create_local() fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) # store the descriptor in the backbone the_backbone.worker_queue = comm_channel.descriptor diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 82f41e3db..b8b725f79 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -104,7 +104,7 @@ def test_request_dispatcher( to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) backbone_fs = BackboneFeatureStore(the_storage, allow_reserved_writes=True) @@ -116,7 +116,7 @@ def test_request_dispatcher( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) request_dispatcher = RequestDispatcher( diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 3372bc1ad..4047a731f 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -149,7 +149,7 @@ def mock_messages( config_loader = EnvironmentConfigLoader( 
featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) backbone = config_loader.get_backbone() @@ -212,7 +212,7 @@ def mock_mli_infrastructure_mgr() -> None: config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker = TorchWorker @@ -261,7 +261,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli) # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue @@ -273,7 +273,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker_type = TorchWorker diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index c658f2f26..835bccd2b 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -122,9 +122,7 @@ def mock_messages( offset = 2 * parent_iteration feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor) - request_dispatcher_queue = DragonFLIChannel.from_sender_supplied_descriptor( - dispatch_fli_descriptor - ) + request_dispatcher_queue = DragonFLIChannel.from_descriptor(dispatch_fli_descriptor) for iteration_number in range(2): logged_iteration = offset + iteration_number From 124a195100d357936e370e05f2971a917f2a8079 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 7 Oct 2024 14:47:58 -0500 Subject: [PATCH 30/40] re-home protoclient into mli subpackage --- ex/high_throughput_inference/mock_app.py | 2 +- smartsim/_core/mli/client/__init__.py | 0 smartsim/{ => _core/mli/client}/protoclient.py | 0 tests/dragon/test_protoclient.py | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 smartsim/_core/mli/client/__init__.py rename smartsim/{ => _core/mli/client}/protoclient.py (100%) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 876f9145a..c3b3eaaf4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -51,7 +51,7 @@ from collections import OrderedDict from smartsim.log import get_logger, log_to_file -from smartsim.protoclient import ProtoClient +from smartsim._core.mli.client.protoclient import ProtoClient logger = get_logger("App") diff --git a/smartsim/_core/mli/client/__init__.py b/smartsim/_core/mli/client/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/protoclient.py b/smartsim/_core/mli/client/protoclient.py similarity index 100% rename from smartsim/protoclient.py rename to smartsim/_core/mli/client/protoclient.py diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 
6885acc96..b02859f51 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -51,7 +51,7 @@ from dragon.data.ddict.ddict import DDict # from ..ex..high_throughput_inference.mock_app import ProtoClient -from smartsim.protoclient import ProtoClient +from smartsim._core.mli.client.protoclient import ProtoClient # The tests in this file belong to the dragon group From b2c4cb7f3ee53d79ca77906661931f4acb01d27f Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 7 Oct 2024 16:53:05 -0500 Subject: [PATCH 31/40] additional missed docstrings --- smartsim/_core/entrypoints/service.py | 10 +-- .../_core/launcher/dragon/dragonBackend.py | 4 +- .../_core/launcher/dragon/dragonConnector.py | 74 ++++++++++++++++++- .../mli/infrastructure/comm/broadcaster.py | 5 +- .../_core/mli/infrastructure/comm/consumer.py | 2 - 5 files changed, 82 insertions(+), 13 deletions(-) diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 497bdda2f..12b0cdd4d 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -35,9 +35,9 @@ class Service(ABC): - """Base contract for standalone entrypoint scripts. Defines API for entrypoint - behaviors (event loop, automatic shutdown, cooldown) as well as simple - hooks for status changes""" + """Core API for standalone entrypoint scripts. Makes use of overridable hook + methods to modify behaviors (event loop, automatic shutdown, cooldown) as + well as simple hooks for status changes""" def __init__( self, @@ -46,7 +46,7 @@ def __init__( loop_delay: float = 0, health_check_frequency: float = 0, ) -> None: - """Initialize the ServiceHost + """Initialize the Service :param as_service: Determines if the host runs continuously until shutdown criteria are met, or executes the service lifecycle once and exits @@ -83,7 +83,7 @@ def _can_shutdown(self) -> bool: def _on_start(self) -> None: """Empty hook method for use by subclasses. Called on initial entry into - ServiceHost `execute` event loop before `_on_iteration` is invoked.""" + Service `execute` event loop before `_on_iteration` is invoked.""" logger.debug(f"Starting {self.__class__.__name__}") def _on_shutdown(self) -> None: diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fb33460d8..45d646bf5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -100,7 +100,7 @@ class ProcessGroupInfo: return_codes: t.Optional[t.List[int]] = None """List of return codes of completed processes""" hosts: t.List[str] = field(default_factory=list) - """List of hosts on which the Process Group """ + """List of hosts on which the Process Group should be executed""" redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None """Workers used to redirect stdout and stderr to file""" @@ -593,7 +593,7 @@ def start_event_listener( """Start a standalone event listener. 
:param cpu_affinity: The CPU affinity for the process
-        :param gpu_affinity: The CPU affinity for the process
+        :param gpu_affinity: The GPU affinity for the process
         :returns: The dragon Process managing the process
         :raises SmartSimError: If the backbone is not provided
         """
diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py
index 98670f347..1144b7764 100644
--- a/smartsim/_core/launcher/dragon/dragonConnector.py
+++ b/smartsim/_core/launcher/dragon/dragonConnector.py
@@ -71,17 +71,23 @@ class DragonConnector:
 
     def __init__(self) -> None:
         self._context: zmq.Context[t.Any] = zmq.Context.instance()
+        """ZeroMQ context used to share configuration across requests"""
         self._context.setsockopt(zmq.REQ_CORRELATE, 1)
         self._context.setsockopt(zmq.REQ_RELAXED, 1)
         self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None
+        """ZeroMQ authenticator used to secure queue access"""
         config = get_config()
         self._reset_timeout(config.dragon_server_timeout)
         self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None
+        """ZeroMQ socket exposing the connection to the DragonBackend"""
         self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None
+        """A handle to the process executing the DragonBackend"""
         # Returned by dragon head, useful if shutdown is to be requested
         # but process was started by another connector
         self._dragon_head_pid: t.Optional[int] = None
+        """Process ID of the process executing the DragonBackend"""
         self._dragon_server_path = config.dragon_server_path
+        """Path to a dragon installation"""
         logger.debug(f"Dragon Server path was set to {self._dragon_server_path}")
         self._env_vars: t.Dict[str, str] = {}
         if self._dragon_server_path is None:
@@ -95,7 +101,7 @@ def __init__(self) -> None:
 
     @property
     def is_connected(self) -> bool:
-        """Whether the Connector established a connection to the server
+        """Whether the Connector established a connection to the server.
 
         :return: True if connected
         """
@@ -104,12 +110,18 @@ def is_connected(self) -> bool:
     @property
     def can_monitor(self) -> bool:
         """Whether the Connector knows the PID of the dragon server head process
-        and can monitor its status
+        and can monitor its status.
 
         :return: True if the server can be monitored"""
         return self._dragon_head_pid is not None
 
     def _handshake(self, address: str) -> None:
+        """Perform the handshake process with the DragonBackend and
+        confirm two-way communication is established.
+
+        :param address: The address of the head node socket to initiate a
+        handshake with
+        """
         self._dragon_head_socket = dragonSockets.get_secure_socket(
             self._context, zmq.REQ, False
         )
@@ -132,6 +144,11 @@ def _handshake(self, address: str) -> None:
         ) from e
 
     def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None:
+        """Reset the timeout applied to the ZMQ context. If an authenticator is
+        enabled, also update the authenticator timeouts.
+
+        :param timeout: The timeout value to apply to ZMQ sockets
+        """
         self._context.setsockopt(zmq.SNDTIMEO, value=timeout)
         self._context.setsockopt(zmq.RCVTIMEO, value=timeout)
         if self._authenticator is not None and self._authenticator.thread is not None:
@@ -183,11 +200,19 @@ def _get_new_authenticator(
 
     @staticmethod
     def _get_dragon_log_level() -> str:
+        """Maps the log level from SmartSim to a valid log level
+        for a dragon process.
+
+        :returns: The dragon log level string
+        """
         smartsim_to_dragon = defaultdict(lambda: "NONE")
         smartsim_to_dragon["developer"] = "INFO"
         return smartsim_to_dragon.get(get_config().log_level, "NONE")
 
     def _connect_to_existing_server(self, path: Path) -> None:
+        """Connects to an existing DragonBackend using address information from
+        a persisted dragon log file.
+        """
         config = get_config()
         dragon_config_log = path / config.dragon_log_filename
@@ -217,6 +242,11 @@ def _connect_to_existing_server(self, path: Path) -> None:
         return
 
     def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]:
+        """Instantiate the ZMQ socket to be used by the connector.
+
+        :param socket_addr: The socket address the connector should bind to
+        :returns: The bound socket
+        """
         config = get_config()
         connector_socket: t.Optional[zmq.Socket[t.Any]] = None
         self._reset_timeout(config.dragon_server_startup_timeout)
@@ -423,6 +453,15 @@ def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse
     def _parse_launched_dragon_server_info_from_iterable(
         stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None
     ) -> t.List[t.Dict[str, str]]:
+        """Parses dragon backend connection information from a stream.
+
+        :param stream: The stream to inspect. Usually the stdout of the
+        DragonBackend process
+        :param num_dragon_envs: The expected number of dragon environments
+        to parse from the stream.
+        :returns: A list of dictionaries, one per environment, containing
+        the parsed server information
+        """
         lines = (line.strip() for line in stream)
         lines = (line for line in lines if line)
         tokenized = (line.split(maxsplit=1) for line in lines)
@@ -449,6 +488,15 @@ def _parse_launched_dragon_server_info_from_files(
         file_paths: t.List[t.Union[str, "os.PathLike[str]"]],
         num_dragon_envs: t.Optional[int] = None,
    ) -> t.List[t.Dict[str, str]]:
+        """Read known log files into a stream and parse dragon server configuration
+        from the stream.
+
+        :param file_paths: Paths to files containing dragon server configuration
+        :param num_dragon_envs: The expected number of dragon environments to be found
+        in the files
+        :returns: The parsed server configuration, one item per
+        discovered dragon environment
+        """
         with fileinput.FileInput(file_paths) as ifstream:
             dragon_envs = cls._parse_launched_dragon_server_info_from_iterable(
                 ifstream, num_dragon_envs
@@ -463,6 +511,15 @@ def _send_req_with_socket(
         socket: zmq.Socket[t.Any],
         request: DragonRequest,
         send_flags: int = 0,
         recv_flags: int = 0,
     ) -> DragonResponse:
+        """Sends a synchronous request through a ZMQ socket.
+
+        :param socket: Socket to send on
+        :param request: The request to send
+        :param send_flags: Configuration to apply to the send operation
+        :param recv_flags: Configuration to apply to the recv operation; used to
+        allow the receiver to immediately respond to the sent request.
+        :returns: The response from the target
+        """
         client = dragonSockets.as_client(socket)
         with DRG_LOCK:
             logger.debug(f"Sending {type(request).__name__}: {request}")
@@ -474,6 +531,13 @@ def _send_req_with_socket(
 
 
 def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT:
+    """Verify that objects can be sent as messages acceptable to the target.
+
+    :param obj: The message to test
+    :param typ: The type that is acceptable
+    :returns: The original `obj` if it is of the requested type
+    :raises TypeError: If the object fails the test and is not
+    an instance of the desired type"""
     if not isinstance(obj, typ):
         raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}")
     return obj
@@ -525,6 +589,12 @@ def _dragon_cleanup(
 
 
 def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path:
+    """Determine the applicable dragon server path for the connector.
+
+    :param fallback: A default dragon server path to use if one is not
+    found in the runtime configuration
+    :returns: The path to the dragon libraries
+    """
     dragon_server_path = get_config().dragon_server_path or os.path.join(
         fallback, ".smartsim", "dragon"
     )
diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
index d813cce12..cd8c45745 100644
--- a/smartsim/_core/mli/infrastructure/comm/broadcaster.py
+++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
@@ -61,6 +61,8 @@ def __init__(
 
         :param backbone: The MLI backbone feature store
         :param channel_factory: Factory method to construct new channel instances
+        :param name: A unique identifier assigned to the broadcaster for logging. If
+        not provided, the system will auto-assign one.
         """
         self._backbone = backbone
         """The backbone feature store used to retrieve consumer descriptors"""
@@ -178,8 +180,7 @@ def _broadcast(self, timeout: float = 0.001) -> BroadcastResult:
 
         :param timeout: Maximum time to wait (in seconds) for messages to send
         :returns: BroadcastResult containing the number of messages that were
         successfully and unsuccessfully sent for all consumers
-        :raises SmartSimError: If the channel fails to attach
-        :raises SmartSimError: If broadcasting fails
+        :raises SmartSimError: If the channel fails to attach or broadcasting fails
         """
         # allow descriptors to be empty since events are buffered
         self._descriptors = set(x for x in self._backbone.notification_channels if x)
diff --git a/smartsim/_core/mli/infrastructure/comm/consumer.py b/smartsim/_core/mli/infrastructure/comm/consumer.py
index 3e03ba86c..08b5c4785 100644
--- a/smartsim/_core/mli/infrastructure/comm/consumer.py
+++ b/smartsim/_core/mli/infrastructure/comm/consumer.py
@@ -54,7 +54,6 @@ class EventConsumer:
     def __init__(
         self,
         comm_channel: CommChannelBase,
-        # channel_factory: ...,
         backbone: BackboneFeatureStore,
         filters: t.Optional[t.List[str]] = None,
         name: t.Optional[str] = None,
@@ -68,7 +67,6 @@ def __init__(
         events will be delivered
         :param name: A user-friendly name for logging. If not provided, an
         auto-generated GUID will be used
-        :raises ValueError: If batch_timeout <= 0
         """
         self._comm_channel = comm_channel
         """The comm channel used by the consumer to receive messages.
The channel
From 0c495f6a81347d47c785da028fc2fa3f4b213b1a Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 8 Oct 2024 11:46:36 -0500
Subject: [PATCH 32/40] improve service param docstrings, avoid separate
 var/param descriptions

---
 smartsim/_core/entrypoints/service.py | 35 +++++++++++++++++----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py
index 12b0cdd4d..719c2a60f 100644
--- a/smartsim/_core/entrypoints/service.py
+++ b/smartsim/_core/entrypoints/service.py
@@ -48,26 +48,35 @@ def __init__(
     ) -> None:
         """Initialize the Service
 
-        :param as_service: Determines if the host runs continuously until
-        shutdown criteria are met, or executes the service lifecycle once and exits
+        :param as_service: Determines lifetime of the service. When `True`, calling
+        `execute` runs the service continuously until shutdown criteria are met.
+        Otherwise, `execute` performs a single pass through the service lifecycle and
+        automatically exits (regardless of the result of `_can_shutdown`).
         :param cooldown: Period of time (in seconds) to allow the service to run
-        after a shutdown is permitted. Enables the service to avoid restarting if
-        new work is discovered. A value of 0 disables the cooldown.
-        :param loop_delay: Time (in seconds) between iterations of the event loop
+        after a shutdown is permitted. Enables the service to avoid restarting if
+        new work is discovered. A value of 0 disables the cooldown.
+        :param loop_delay: Duration (in seconds) of a forced delay between
+        iterations of the event loop
         :param health_check_frequency: Time (in seconds) between calls to a
-        health check handler. A value of 0 triggers the health check on every
-        iteration.
+        health check handler. A value of 0 triggers the health check on every
+        iteration.
         """
         self._as_service = as_service
-        """If the service should run until shutdown function returns True"""
+        """Determines lifetime of the service. When `True`, calling
+        `execute` runs the service continuously until shutdown criteria are met.
+        Otherwise, `execute` performs a single pass through the service lifecycle and
+        automatically exits (regardless of the result of `_can_shutdown`)."""
         self._cooldown = abs(cooldown)
-        """Duration of a cooldown period between requests to the service
-        before shutdown"""
+        """Period of time (in seconds) to allow the service to run
+        after a shutdown is permitted. Enables the service to avoid restarting if
+        new work is discovered. A value of 0 disables the cooldown."""
         self._loop_delay = abs(loop_delay)
-        """Forced delay between iterations of the event loop"""
+        """Duration (in seconds) of a forced delay between
+        iterations of the event loop"""
         self._health_check_frequency = health_check_frequency
-        """The time (in seconds) between desired health checks. Frequency of 0
-        will trigger the health check on every event loop iteration."""
+        """Time (in seconds) between calls to a
+        health check handler.
A value of 0 triggers the health check on every
+        iteration."""
         self._last_health_check = time.time()
         """The timestamp of the latest health check"""
From 1fc59e4954fa2192406623f34f101345d355ba9b Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 8 Oct 2024 12:21:53 -0500
Subject: [PATCH 33/40] docstring fixes

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++--
 smartsim/_core/mli/infrastructure/comm/broadcaster.py | 4 ++--
 smartsim/_core/mli/infrastructure/control/listener.py | 2 --
 .../mli/infrastructure/control/request_dispatcher.py | 1 -
 smartsim/_core/mli/infrastructure/worker/worker.py | 11 +++++++----
 smartsim/_core/mli/message_handler.py | 3 +++
 6 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 45d646bf5..9f0473d0a 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -160,7 +160,7 @@ class DragonBackend:
     _DEFAULT_NUM_MGR_PER_NODE = 2
     """The default number of manager processes for each feature store node"""
     _DEFAULT_MEM_PER_NODE = 256 * 1024**2
-    """The default memory capacity to allocate for a feaure store node (in megabytes)"""
+    """The default memory capacity (in bytes) to allocate for a feature store node"""
 
     def __init__(self, pid: int) -> None:
         self._pid = pid
@@ -555,7 +555,7 @@ def _create_backbone(self) -> BackboneFeatureStore:
         environment variables of this process to include the backbone
         descriptor.
 
-        :returns: The descriptor of the backbone feature store
+        :returns: The backbone feature store
         """
         if self._backbone is None:
             backbone_storage = create_ddict(
diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
index cd8c45745..56dcf549f 100644
--- a/smartsim/_core/mli/infrastructure/comm/broadcaster.py
+++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
@@ -61,6 +61,8 @@ def __init__(
 
         :param backbone: The MLI backbone feature store
         :param channel_factory: Factory method to construct new channel instances
-        :param name: A unique identifier assigned to the broadcaster for logging. If
-        not provided, the system will auto-assign one.
+        :param name: A user-friendly name for logging. If not provided, an
+        auto-generated GUID will be used
         """
         self._backbone = backbone
         """The backbone feature store used to retrieve consumer descriptors"""
diff --git a/smartsim/_core/mli/infrastructure/control/listener.py b/smartsim/_core/mli/infrastructure/control/listener.py
index b5c529615..56a7b12d3 100644
--- a/smartsim/_core/mli/infrastructure/control/listener.py
+++ b/smartsim/_core/mli/infrastructure/control/listener.py
@@ -242,8 +242,6 @@ def _create_eventing(self) -> EventConsumer:
         Create an event publisher and event consumer for communicating
         with other MLI resources.
 
-        :param backbone: The backbone feature store used by the MLI backend.
-        NOTE: the backbone must be initialized before connecting eventing clients.
:returns: The newly created EventConsumer instance diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index b0f931cb3..3cc8f88da 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -227,7 +227,6 @@ def __init__( :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs :param mem_pool_size: Size of the memory pool used to allocate tensors - :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) self._queues: dict[str, list[BatchQueue]] = {} diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f1718f053..9556b8e43 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -200,6 +200,7 @@ def __init__(self, model: t.Any) -> None: :param model: The loaded model """ self.model = model + """The loaded model (e.g. a TensorFlow, PyTorch, ONNX, etc. model)""" class TransformInputResult: @@ -549,7 +550,7 @@ def place_output( feature store. :param request: The request that triggered the pipeline - :param execute_result: Results from inference + :param transform_result: Transformed version of the inference result :param feature_stores: Available feature stores used for persistence :returns: A collection of keys that were placed in the feature store :raises ValueError: If a feature store is not provided @@ -579,10 +580,12 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): def load_model( batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - """Given a loaded MachineLearningModel, ensure it is loaded into - device memory. + """Given the raw bytes of an ML model that were fetched, ensure + it is loaded into device memory. :param request: The request that triggered the pipeline + :param fetch_result: The result of a fetch-model operation; contains + the raw bytes of the ML model. :param device: The device on which the model must be placed :returns: LoadModelResult wrapping the model loaded for the request :raises ValueError: If model reference object is not found @@ -599,7 +602,7 @@ def transform_input( """Given a collection of data, perform a transformation on the data and put the raw tensor data on a MemoryPool allocation. - :param request: The request that triggered the pipeline + :param batch: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store :param mem_pool: The memory pool used to access batched input tensors :returns: The transformed inputs wrapped in a TransformInputResult diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index d7324e4a4..2511e9d25 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -35,6 +35,9 @@ class MessageHandler: + """Utility methods for transforming capnproto messages to and from + internal representations. 
+ """ @staticmethod def build_tensor_descriptor( order: "tensor_capnp.Order", From b31612864694d7af3736c96765c8e1db0f27a46e Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 12:29:56 -0500 Subject: [PATCH 34/40] increase default memory per backbone node --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9f0473d0a..5e0129914 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -159,7 +159,7 @@ class DragonBackend: _DEFAULT_NUM_MGR_PER_NODE = 2 """The default number of manager processes for each feature store node""" - _DEFAULT_MEM_PER_NODE = 256 * 1024**2 + _DEFAULT_MEM_PER_NODE = 512 * 1024**2 """The default memory capacity (in bytes) to allocate for a feaure store node""" def __init__(self, pid: int) -> None: From 2ed47a4129c7c1d66096783dd9bb6c04bedf1692 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 13:02:23 -0500 Subject: [PATCH 35/40] fix fixture usage bug (worker queue preloaded into backbone) --- tests/dragon/test_protoclient.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index b02859f51..fff5fac47 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -138,6 +138,10 @@ def test_protoclient_timeout( with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: start_time = time.time() + # remove the worker queue value from the backbone if it exists + # to ensure the timeout occurs + the_backbone.pop(BackboneFeatureStore.MLI_WORKER_QUEUE) + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) ProtoClient(timing_on=False, backbone_timeout=backbone_timeout) From 8ccebb53690d9060f544fe1feb0d3524f7c85dc1 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 13:54:41 -0500 Subject: [PATCH 36/40] extract dragon fixtures into dragon conftest.py --- conftest.py | 78 -------- tests/dragon/conftest.py | 172 ++++++++++++++++++ tests/dragon/test_dragon_ddict_utils.py | 6 - tests/dragon/test_environment_loader.py | 7 - tests/dragon/test_error_handling.py | 25 --- tests/dragon/test_event_consumer.py | 35 ---- tests/dragon/test_featurestore.py | 36 ---- tests/dragon/test_featurestore_integration.py | 20 -- tests/dragon/test_protoclient.py | 22 --- tests/dragon/test_request_dispatcher.py | 8 +- 10 files changed, 173 insertions(+), 236 deletions(-) create mode 100644 tests/dragon/conftest.py diff --git a/conftest.py b/conftest.py index 7302482e6..54a47f9e2 100644 --- a/conftest.py +++ b/conftest.py @@ -1022,81 +1022,3 @@ def _prepare_db(db_config: DBConfiguration) -> PrepareDatabaseOutput: return PrepareDatabaseOutput(db, new_db) return _prepare_db - - -class MsgPumpRequest(t.NamedTuple): - """Fields required for starting a simulated inference request producer.""" - - backbone_descriptor: str - """The descriptor to use when connecting the message pump to a - backbone featurestore. - - Passed to the message pump as `--fs-descriptor` - """ - work_queue_descriptor: str - """The descriptor to use for sending work from the pump to the worker manager. - - Passed to the message pump as `--dispatch-fli-descriptor` - """ - callback_descriptor: str - """The descriptor the worker should use to returning results. 
- - Passed to the message pump as `--callback-descriptor` - """ - iteration_index: int = 1 - """If calling the message pump repeatedly, supply an iteration index to ensure - that logged messages appear unique instead of apparing to be duplicated logs. - - Passed to the message pump as `--parent-iteration` - """ - - def as_command(self) -> t.List[str]: - """Produce CLI arguments suitable for calling subprocess.Popen that - to execute the msg pump. - - NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]` - portion of the necessary parameters to Popen. - - :returns: The arguments of the request formatted appropriately to - Popen the `/tests/dragon/utils/msg_pump.py`""" - return [ - "--dispatch-fli-descriptor", - self.work_queue_descriptor, - "--fs-descriptor", - self.backbone_descriptor, - "--parent-iteration", - str(self.iteration_index), - "--callback-descriptor", - self.callback_descriptor, - ] - - -@pytest.fixture(scope="session") -def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]: - """A pytest fixture used to create a mock event producer capable of - feeding asynchronous inference requests to tests requiring them. - - :returns: A function that opens a subprocess running a mock message pump - """ - - def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: - """Invoke the message pump entry-point with the descriptors - from the request. - - :param request: A request containing all parameters required to - invoke the message pump entrypoint - :returns: The Popen object for the subprocess that was started""" - # /tests/dragon/utils/msg_pump.py - msg_pump_script = "tests/dragon/utils/msg_pump.py" - msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script - - cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] - - popen = subprocess.Popen( - args=cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - return popen - - return run_message_pump diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py new file mode 100644 index 000000000..3084a2f38 --- /dev/null +++ b/tests/dragon/conftest.py @@ -0,0 +1,172 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
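The fixtures relocated into this new tests/dragon/conftest.py keep the interface they had in the root conftest.py. As a hedged usage sketch (fixture and field names are taken from the file added here; the callback descriptor placeholder, test body, and timeout are illustrative assumptions rather than values used by the real tests):

    def test_msg_pump_smoke(msg_pump_factory, the_backbone, the_worker_channel) -> None:
        request = MsgPumpRequest(
            backbone_descriptor=the_backbone.descriptor,
            work_queue_descriptor=the_worker_channel.descriptor,
            callback_descriptor="<callback-channel-descriptor>",  # placeholder
            iteration_index=1,
        )
        # spawns tests/dragon/utils/msg_pump.py with request.as_command() arguments
        msg_pump = msg_pump_factory(request)
        assert msg_pump.wait(timeout=60) == 0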
+
+from __future__ import annotations
+
+import pathlib
+import subprocess
+import sys
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+from dragon.channels import Channel
+from dragon.data.ddict.ddict import DDict
+from dragon.fli import FLInterface
+from dragon.mpbridge.queues import DragonQueue
+
+# isort: on
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.storage import dragon_util
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+    DragonFeatureStore,
+)
+
+class MsgPumpRequest(t.NamedTuple):
+    """Fields required for starting a simulated inference request producer."""
+
+    backbone_descriptor: str
+    """The descriptor to use when connecting the message pump to a
+    backbone featurestore.
+
+    Passed to the message pump as `--fs-descriptor`
+    """
+    work_queue_descriptor: str
+    """The descriptor to use for sending work from the pump to the worker manager.
+
+    Passed to the message pump as `--dispatch-fli-descriptor`
+    """
+    callback_descriptor: str
+    """The descriptor the worker should use for returning results.
+
+    Passed to the message pump as `--callback-descriptor`
+    """
+    iteration_index: int = 1
+    """If calling the message pump repeatedly, supply an iteration index to ensure
+    that logged messages appear unique instead of appearing to be duplicated logs.
+
+    Passed to the message pump as `--parent-iteration`
+    """
+
+    def as_command(self) -> t.List[str]:
+        """Produce CLI arguments suitable for calling subprocess.Popen
+        to execute the msg pump.
+
+        NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]`
+        portion of the necessary parameters to Popen.
+
+        :returns: The arguments of the request formatted appropriately to
+        Popen the `/tests/dragon/utils/msg_pump.py`"""
+        return [
+            "--dispatch-fli-descriptor",
+            self.work_queue_descriptor,
+            "--fs-descriptor",
+            self.backbone_descriptor,
+            "--parent-iteration",
+            str(self.iteration_index),
+            "--callback-descriptor",
+            self.callback_descriptor,
+        ]
+
+
+@pytest.fixture(scope="session")
+def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]:
+    """A pytest fixture used to create a mock event producer capable of
+    feeding asynchronous inference requests to tests requiring them.
+
+    :returns: A function that opens a subprocess running a mock message pump
+    """
+
+    def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen:
+        """Invoke the message pump entry-point with the descriptors
+        from the request.
+ + :param request: A request containing all parameters required to + invoke the message pump entrypoint + :returns: The Popen object for the subprocess that was started""" + # /tests/dragon/utils/msg_pump.py + msg_pump_script = "tests/dragon/utils/msg_pump.py" + msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script + + cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] + + popen = subprocess.Popen( + args=cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return popen + + return run_message_pump + + +@pytest.fixture(scope="module") +def the_storage() -> dragon_ddict.DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return dragon_util.create_ddict(1, 2, 32 * 1024**2) + + +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to.""" + channel_ = create_local() + fli_ = FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_) + return comm_channel + + +@pytest.fixture(scope="module") +def the_backbone( + the_storage: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. + + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel + """ + + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone + + +@pytest.fixture(scope="module") +def backbone_descriptor(the_backbone: BackboneFeatureStore) -> str: + # create a shared backbone featurestore + return the_backbone.descriptor diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py index d2240abc1..c8bf687ef 100644 --- a/tests/dragon/test_dragon_ddict_utils.py +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -41,12 +41,6 @@ logger = get_logger(__name__) -@pytest.fixture(scope="module") -def the_storage() -> dragon_ddict.DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return dragon_util.create_ddict(1, 2, 3 * 1024**2) - - @pytest.mark.parametrize( "num_nodes, num_managers, mem_per_node", [ diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index aed1b0ae4..07b2a45c1 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -39,19 +39,12 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> dragon_ddict.DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - @pytest.mark.parametrize( "content", [ diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 8421999a1..aacd47b55 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -40,7 +40,6 @@ from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.comm.channel.dragon_util import create_local from 
smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -55,7 +54,6 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, ModelKey, @@ -82,29 +80,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - -@pytest.fixture(scope="module") -def the_worker_channel() -> DragonFLIChannel: - """Fixture to create a valid descriptor for a worker channel - that can be attached to.""" - channel_ = create_local() - fli_ = FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_) - return comm_channel - - -@pytest.fixture(scope="module") -def backbone_descriptor(the_storage) -> str: - # create a shared backbone featurestore - feature_store = DragonFeatureStore(the_storage) - return feature_store.descriptor - - @pytest.fixture(scope="module") def app_feature_store(the_storage) -> FeatureStore: # create a standalone feature store to mimic a user application putting diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py index 8c752c372..8a241bab1 100644 --- a/tests/dragon/test_event_consumer.py +++ b/tests/dragon/test_event_consumer.py @@ -33,7 +33,6 @@ dragon = pytest.importorskip("dragon") from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer @@ -48,7 +47,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.log import get_logger logger = get_logger(__name__) @@ -67,39 +65,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - -@pytest.fixture(scope="module") -def the_worker_channel() -> DragonFLIChannel: - """Fixture to create a valid descriptor for a worker channel - that can be attached to. Does not modify environment vars.""" - channel_ = create_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_) - return comm_channel - - -@pytest.fixture(scope="module") -def the_backbone( - the_storage: t.Any, the_worker_channel: DragonFLIChannel -) -> BackboneFeatureStore: - """Fixture to create a distributed dragon dictionary and wrap it - in a BackboneFeatureStore. 
- - :param the_storage: The dragon storage engine to use - :param the_worker_channel: Pre-configured worker channel - """ - - backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) - backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor - - return backbone - - def test_eventconsumer_eventpublisher_integration( the_backbone: t.Any, test_dir: str ) -> None: diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index a97accd64..019dcde7a 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -36,15 +36,12 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( time as bbtime, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.log import get_logger logger = get_logger(__name__) @@ -63,39 +60,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - -@pytest.fixture(scope="module") -def the_worker_channel() -> DragonFLIChannel: - """Fixture to create a valid descriptor for a worker channel - that can be attached to. Does not modify environment vars.""" - channel_ = create_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_) - return comm_channel - - -@pytest.fixture(scope="module") -def the_backbone( - the_storage: t.Any, the_worker_channel: DragonFLIChannel -) -> BackboneFeatureStore: - """Fixture to create a distributed dragon dictionary and wrap it - in a BackboneFeatureStore. 
- - :param the_storage: The dragon storage engine to use - :param the_worker_channel: Pre-configured worker channel - """ - - backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) - backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor - - return backbone - - def test_backbone_wait_for_no_keys( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index e9fa3d5dd..23fdc55ab 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -41,10 +41,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import ( - create_ddict, - dragon_ddict, -) # isort: off from dragon.channels import Channel @@ -59,12 +55,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> dragon_ddict.DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 32 * 1024**2) - - @pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: """Fixture to create a valid descriptor for a worker channel @@ -74,16 +64,6 @@ def the_worker_channel() -> DragonCommChannel: return wmgr_channel -@pytest.fixture(scope="module") -def the_backbone(the_storage: t.Any) -> BackboneFeatureStore: - """Fixture to create a distributed dragon dictionary and wrap it - in a BackboneFeatureStore. - - :param the_storage: The dragon storage engine to use - """ - return BackboneFeatureStore(the_storage, allow_reserved_writes=True) - - @pytest.mark.parametrize( "num_events, batch_timeout, max_batches_expected", [ diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index fff5fac47..f84417107 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -42,7 +42,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -60,27 +59,6 @@ logger = get_logger(__name__) -@pytest.fixture(scope="module") -def the_storage() -> DDict: - """Fixture that creates a dragon distributed dictionary. - - :returns: The attached distributed dictionary - """ - return create_ddict(1, 2, 32 * 1024**2) - - -@pytest.fixture(scope="module") -def the_backbone(the_storage) -> BackboneFeatureStore: - """Fixture that creates a dragon backbone feature store. - - :param storage_for_dragon_fs: the distributed dictionary to use in backbone - :returns: The backbone feature store - :returns: The attached `BackboneFeatureStore` - """ - - return BackboneFeatureStore(the_storage, allow_reserved_writes=True) - - @pytest.fixture(scope="module") def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: """Fixture that creates a dragon FLI channel as a stand-in for the diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index b8b725f79..db656998a 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -35,7 +35,7 @@ import numpy as np import pytest -import conftest +from . 
import conftest pytest.importorskip("dragon") @@ -86,12 +86,6 @@ pass -@pytest.fixture(scope="module") -def the_storage() -> DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - @pytest.mark.parametrize("num_iterations", [4]) def test_request_dispatcher( msg_pump_factory: _MsgPumpFactory, num_iterations: int, the_storage: DDict From 608d6bd75b9dc219aaca05bc6a61dbbde5a37bee Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 14:53:40 -0500 Subject: [PATCH 37/40] remove unused import --- tests/dragon/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py index 3084a2f38..6ce9ad148 100644 --- a/tests/dragon/conftest.py +++ b/tests/dragon/conftest.py @@ -41,7 +41,6 @@ from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface -from dragon.mpbridge.queues import DragonQueue # isort: on From 68d0d0c2155f3534804bdaf661ac6dcf76fcaa79 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 19:50:12 -0500 Subject: [PATCH 38/40] fix send-multi with FLI after sender-supplied channel removal --- smartsim/_core/mli/comm/channel/dragon_fli.py | 30 +++++++++++++----- .../control/request_dispatcher.py | 1 + smartsim/_core/mli/message_handler.py | 1 + tests/dragon/conftest.py | 19 ++++++++---- tests/dragon/test_request_dispatcher.py | 18 +++++------ tests/dragon/utils/msg_pump.py | 31 ++++++++++--------- 6 files changed, 61 insertions(+), 39 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index aa9be8897..5283ba2dd 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -26,7 +26,6 @@ # isort: off from dragon import fli -import dragon.channels as dch # isort: on @@ -59,9 +58,6 @@ def __init__( self._fli = fli_ """The underlying dragon FLInterface used by this CommChannel for communications""" - self._channel: t.Optional["dch.Channel"] = None - """The underlying dragon Channel used by a sender-side DragonFLIChannel - to attach to the main FLI channel""" self._buffer_size: int = buffer_size """Maximum number of messages that can be buffered before sending""" @@ -73,18 +69,36 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: :raises SmartSimError: If sending message fails """ try: - if self._channel is None: - self._channel = drg_util.create_local(self._buffer_size) + channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + with self._fli.sendh(timeout=None, stream_channel=channel) as sendh: sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: - self._channel = None raise SmartSimError( f"Error sending via DragonFLIChannel {self.descriptor}" ) from e + def send_multiple(self, values: t.Sequence[bytes], timeout: float = 0.001) -> None: + """Send a message through the underlying communication channel. 
+ + :param values: The values to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :raises SmartSimError: If sending message fails + """ + try: + channel = drg_util.create_local(self._buffer_size) + + with self._fli.sendh(timeout=None, stream_channel=channel) as sendh: + for value in values: + sendh.send_bytes(value) + logger.debug(f"DragonFLIChannel {self.descriptor} sent message") + except Exception as e: + self._channel = None + raise SmartSimError( + f"Error sending via DragonFLIChannel {self.descriptor} {e}" + ) from e + def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 3cc8f88da..e22a2c8f6 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -371,6 +371,7 @@ def _on_iteration(self) -> None: None, ) + logger.debug(f"Dispatcher is processing {len(bytes_list)} messages") request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 2511e9d25..e3d46a7ab 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -38,6 +38,7 @@ class MessageHandler: """Utility methods for transforming capnproto messages to and from internal representations. """ + @staticmethod def build_tensor_descriptor( order: "tensor_capnp.Order", diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py index 6ce9ad148..6903f7b9d 100644 --- a/tests/dragon/conftest.py +++ b/tests/dragon/conftest.py @@ -50,9 +50,11 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) +from smartsim.log import get_logger + +logger = get_logger(__name__) +msg_pump_path = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" + class MsgPumpRequest(t.NamedTuple): """Fields required for starting a simulated inference request producer.""" @@ -116,17 +118,22 @@ def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: :param request: A request containing all parameters required to invoke the message pump entrypoint :returns: The Popen object for the subprocess that was started""" - # /tests/dragon/utils/msg_pump.py - msg_pump_script = "tests/dragon/utils/msg_pump.py" - msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script + assert request.backbone_descriptor + assert request.callback_descriptor + assert request.work_queue_descriptor + # /tests/dragon/utils/msg_pump.py cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] + logger.debug(f"Executing msg_pump with command: {cmd}") popen = subprocess.Popen( args=cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) + + assert popen is not None + assert popen.returncode is None return popen return run_message_pump diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index db656998a..a6c4ac5dd 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -73,7 +73,6 @@ from smartsim.log import get_logger logger = get_logger(__name__) -mock_msg_pump_path = pathlib.Path(__file__).parent / "utils" / 
"msg_pump.py" _MsgPumpFactory = t.Callable[[conftest.MsgPumpRequest], sp.Popen] # The tests in this file belong to the dragon group @@ -129,8 +128,8 @@ def test_request_dispatcher( ) request_dispatcher._on_start() - pump_processes: t.List[sp.Popen] = [] + # put some messages into the work queue for the dispatcher to pickup for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] @@ -149,18 +148,22 @@ def test_request_dispatcher( ) msg_pump = msg_pump_factory(request) - pump_processes.append(msg_pump) + + assert msg_pump is not None, "Msg Pump Process Creation Failed" + assert msg_pump.wait() == 0 time.sleep(1) - for _ in range(200): + for i in range(15): try: request_dispatcher._on_iteration() batch = request_dispatcher.task_queue.get(timeout=0.1) break except Empty: + logger.warning(f"Task queue is empty on iteration {i}") continue except Exception as exc: + logger.error(f"Task queue exception on iteration {i}") raise exc assert batch is not None @@ -219,13 +222,6 @@ def test_request_dispatcher( assert model_key not in request_dispatcher._active_queues assert model_key not in request_dispatcher._queues - msg_pump.wait() - - for msg_pump in pump_processes: - if msg_pump.returncode is not None: - continue - msg_pump.terminate() - # Try to remove the dispatcher and free the memory del request_dispatcher gc.collect() diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index 835bccd2b..4b9833b91 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -27,7 +27,7 @@ import io import logging import pathlib -import time +import sys import typing as t import pytest @@ -44,7 +44,6 @@ # isort: on -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, @@ -124,6 +123,8 @@ def mock_messages( feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor) request_dispatcher_queue = DragonFLIChannel.from_descriptor(dispatch_fli_descriptor) + feature_store[model_key] = load_model() + for iteration_number in range(2): logged_iteration = offset + iteration_number logger.debug(f"Sending mock message {logged_iteration}") @@ -164,13 +165,9 @@ def mock_messages( logger.info( f"Retrieving {iteration_number} from callback channel: {callback_descriptor}" ) - callback_channel = DragonCommChannel.from_descriptor(callback_descriptor) - # Results will be empty. The test pulls messages off the queue before they - # can be serviced by a worker. Just ensure the callback channel works. 
- results = callback_channel.recv(timeout=0.1) - logger.debug(f"Received mock message results on callback channel: {results}") - time.sleep(1) + # send the header & body together so they arrive together + request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()]) if __name__ == "__main__": @@ -185,9 +182,15 @@ def mock_messages( args = args.parse_args() - mock_messages( - args.dispatch_fli_descriptor, - args.fs_descriptor, - args.parent_iteration, - args.callback_descriptor, - ) + try: + mock_messages( + args.dispatch_fli_descriptor, + args.fs_descriptor, + args.parent_iteration, + args.callback_descriptor, + ) + except Exception as ex: + logger.exception("The message pump did not execute properly") + sys.exit(100) + + sys.exit(0) From c28870f19e31c28ea1803ac8fc40a114ea502e2a Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 9 Oct 2024 16:44:50 -0500 Subject: [PATCH 39/40] Update dispatch tests to use dragon processes --- smartsim/_core/mli/comm/channel/dragon_fli.py | 16 ++- tests/dragon/conftest.py | 123 ++++++------------ tests/dragon/test_request_dispatcher.py | 44 ++++--- tests/dragon/utils/msg_pump.py | 81 ++++++++---- 4 files changed, 130 insertions(+), 134 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 5283ba2dd..0b462af54 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -26,6 +26,7 @@ # isort: off from dragon import fli +from dragon.channels import Channel # isort: on @@ -56,6 +57,10 @@ def __init__( descriptor = drg_util.channel_to_descriptor(fli_) super().__init__(descriptor) + self._channel: t.Optional["Channel"] = None + """The underlying dragon Channel used by a sender-side DragonFLIChannel + to attach to the main FLI channel""" + self._fli = fli_ """The underlying dragon FLInterface used by this CommChannel for communications""" self._buffer_size: int = buffer_size @@ -79,7 +84,11 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: f"Error sending via DragonFLIChannel {self.descriptor}" ) from e - def send_multiple(self, values: t.Sequence[bytes], timeout: float = 0.001) -> None: + def send_multiple( + self, + values: t.Sequence[bytes], + timeout: float = 0.001, + ) -> None: """Send a message through the underlying communication channel. 
:param values: The values to send @@ -87,9 +96,10 @@ def send_multiple(self, values: t.Sequence[bytes], timeout: float = 0.001) -> No :raises SmartSimError: If sending message fails """ try: - channel = drg_util.create_local(self._buffer_size) + if self._channel is None: + self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=channel) as sendh: + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: for value in values: sendh.send_bytes(value) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py index 6903f7b9d..d54270017 100644 --- a/tests/dragon/conftest.py +++ b/tests/dragon/conftest.py @@ -26,7 +26,9 @@ from __future__ import annotations +import os import pathlib +import socket import subprocess import sys import typing as t @@ -37,9 +39,10 @@ # isort: off import dragon.data.ddict.ddict as dragon_ddict +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface # isort: on @@ -53,90 +56,6 @@ from smartsim.log import get_logger logger = get_logger(__name__) -msg_pump_path = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" - - -class MsgPumpRequest(t.NamedTuple): - """Fields required for starting a simulated inference request producer.""" - - backbone_descriptor: str - """The descriptor to use when connecting the message pump to a - backbone featurestore. - - Passed to the message pump as `--fs-descriptor` - """ - work_queue_descriptor: str - """The descriptor to use for sending work from the pump to the worker manager. - - Passed to the message pump as `--dispatch-fli-descriptor` - """ - callback_descriptor: str - """The descriptor the worker should use to returning results. - - Passed to the message pump as `--callback-descriptor` - """ - iteration_index: int = 1 - """If calling the message pump repeatedly, supply an iteration index to ensure - that logged messages appear unique instead of apparing to be duplicated logs. - - Passed to the message pump as `--parent-iteration` - """ - - def as_command(self) -> t.List[str]: - """Produce CLI arguments suitable for calling subprocess.Popen that - to execute the msg pump. - - NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]` - portion of the necessary parameters to Popen. - - :returns: The arguments of the request formatted appropriately to - Popen the `/tests/dragon/utils/msg_pump.py`""" - return [ - "--dispatch-fli-descriptor", - self.work_queue_descriptor, - "--fs-descriptor", - self.backbone_descriptor, - "--parent-iteration", - str(self.iteration_index), - "--callback-descriptor", - self.callback_descriptor, - ] - - -@pytest.fixture(scope="session") -def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]: - """A pytest fixture used to create a mock event producer capable of - feeding asynchronous inference requests to tests requiring them. - - :returns: A function that opens a subprocess running a mock message pump - """ - - def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: - """Invoke the message pump entry-point with the descriptors - from the request. 
- - :param request: A request containing all parameters required to - invoke the message pump entrypoint - :returns: The Popen object for the subprocess that was started""" - assert request.backbone_descriptor - assert request.callback_descriptor - assert request.work_queue_descriptor - - # /tests/dragon/utils/msg_pump.py - cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] - logger.debug(f"Executing msg_pump with command: {cmd}") - - popen = subprocess.Popen( - args=cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - assert popen is not None - assert popen.returncode is None - return popen - - return run_message_pump @pytest.fixture(scope="module") @@ -176,3 +95,35 @@ def the_backbone( def backbone_descriptor(the_backbone: BackboneFeatureStore) -> str: # create a shared backbone featurestore return the_backbone.descriptor + + +def function_as_dragon_proc( + entrypoint_fn: t.Callable[[t.Any], None], + args: t.List[t.Any], + cpu_affinity: t.List[int], + gpu_affinity: t.List[int], +) -> dragon_process.Process: + """Execute a function as an independent dragon process. + + :param entrypoint_fn: The function to execute + :param args: The arguments for the entrypoint function + :param cpu_affinity: The cpu affinity for the process + :param gpu_affinity: The gpu affinity for the process + :returns: The dragon process handle + """ + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=entrypoint_fn, + args=args, + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index a6c4ac5dd..70d73e243 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -26,7 +26,6 @@ import gc import os -import pathlib import subprocess as sp import time import typing as t @@ -36,6 +35,7 @@ import pytest from . 
import conftest +from .utils import msg_pump pytest.importorskip("dragon") @@ -68,12 +68,10 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.log import get_logger logger = get_logger(__name__) -_MsgPumpFactory = t.Callable[[conftest.MsgPumpRequest], sp.Popen] # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -87,7 +85,9 @@ @pytest.mark.parametrize("num_iterations", [4]) def test_request_dispatcher( - msg_pump_factory: _MsgPumpFactory, num_iterations: int, the_storage: DDict + num_iterations: int, + the_storage: DDict, + test_dir: str, ) -> None: """Test the request dispatcher batching and queueing system @@ -113,7 +113,7 @@ def test_request_dispatcher( ) request_dispatcher = RequestDispatcher( - batch_timeout=0, + batch_timeout=1000, batch_size=2, config_loader=config_loader, worker_type=TorchWorker, @@ -130,6 +130,8 @@ def test_request_dispatcher( request_dispatcher._on_start() # put some messages into the work queue for the dispatcher to pickup + channels = [] + processes = [] for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] @@ -139,27 +141,31 @@ def test_request_dispatcher( # down when mock_messages terms but before the final response message is sent callback_channel = DragonCommChannel.from_local() - - request = conftest.MsgPumpRequest( - backbone_fs.descriptor, - worker_queue.descriptor, - callback_channel.descriptor, - i, + channels.append(callback_channel) + + process = conftest.function_as_dragon_proc( + msg_pump.mock_messages, + [ + worker_queue.descriptor, + backbone_fs.descriptor, + i, + callback_channel.descriptor, + ], + [], + [], ) + processes.append(process) + process.start() + assert process.returncode is None, "The message pump failed to start" - msg_pump = msg_pump_factory(request) - - assert msg_pump is not None, "Msg Pump Process Creation Failed" - assert msg_pump.wait() == 0 - - time.sleep(1) - + # give dragon some time to populate the message queues for i in range(15): try: request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=0.1) + batch = request_dispatcher.task_queue.get(timeout=1.0) break except Empty: + time.sleep(2) logger.warning(f"Task queue is empty on iteration {i}") continue except Exception as exc: diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index 4b9833b91..8d69e57c6 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -28,6 +28,7 @@ import logging import pathlib import sys +import time import typing as t import pytest @@ -109,13 +110,13 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: return model_path -def mock_messages( +def _mock_messages( dispatch_fli_descriptor: str, fs_descriptor: str, parent_iteration: int, callback_descriptor: str, ) -> None: - """Mock event producer for triggering the inference pipeline""" + """Mock event producer for triggering the inference pipeline.""" model_key = "mini-model" # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) offset = 2 * parent_iteration @@ -131,8 +132,6 @@ def mock_messages( output_key = f"output-{iteration_number}" - feature_store[model_key] = load_model() - tensor = ( (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) ).numpy() @@ -156,18 +155,53 @@ 
def mock_messages(
         logger.info(f"Sending request {iteration_number} to request_dispatcher_queue")
         request_bytes = MessageHandler.serialize_request(request)
-        with request_dispatcher_queue._fli.sendh(
-            timeout=None, stream_channel=request_dispatcher_queue._channel
-        ) as sendh:
-            sendh.send_bytes(request_bytes)
-            sendh.send_bytes(tensor.tobytes())
-
-        logger.info(
-            f"Retrieving {iteration_number} from callback channel: {callback_descriptor}"
-        )
+
+        logger.info("Sending msg_envelope")
+
+        # cuid = request_dispatcher_queue._channel.cuid
+        # logger.info(f"\tInternal cuid: {cuid}")
 
         # send the header & body together so they arrive together
-        request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()])
+        try:
+            request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()])
+            logger.info(f"\tenvelope 0: {request_bytes[:5]}...")
+            logger.info(f"\tenvelope 1: {tensor.tobytes()[:5]}...")
+        except Exception:
+            logger.exception("Unable to send request envelope")
+
+    logger.info("All messages sent")
+
+    # keep the process alive for an extra 15 seconds to let the processor
+    # have access to the channels before they're destroyed
+    for _ in range(15):
+        time.sleep(1)
+
+
+def mock_messages(
+    dispatch_fli_descriptor: str,
+    fs_descriptor: str,
+    parent_iteration: int,
+    callback_descriptor: str,
+) -> int:
+    """Mock event producer for triggering the inference pipeline. Used
+    when started via multiprocessing."""
+    logger.info(f"{dispatch_fli_descriptor=}")
+    logger.info(f"{fs_descriptor=}")
+    logger.info(f"{parent_iteration=}")
+    logger.info(f"{callback_descriptor=}")
+
+    try:
+        _mock_messages(
+            dispatch_fli_descriptor,
+            fs_descriptor,
+            parent_iteration,
+            callback_descriptor,
+        )
+    except Exception:
+        logger.exception("The message pump did not execute properly")
+        return 1
+
+    return 0
 
 
 if __name__ == "__main__":
@@ -182,9 +216,15 @@ def mock_messages(
 
     args = args.parse_args()
 
-    try:
-        mock_messages(
-            args.dispatch_fli_descriptor,
-            args.fs_descriptor,
-            args.parent_iteration,
-            args.callback_descriptor,
-        )
-    except Exception as ex:
-        logger.exception("The message pump did not execute properly")
-        sys.exit(100)
-
-    sys.exit(0)
+    return_code = mock_messages(
+        args.dispatch_fli_descriptor,
+        args.fs_descriptor,
+        args.parent_iteration,
+        args.callback_descriptor,
+    )
+    sys.exit(return_code)

From 78d5598b2f7021cb94a25989180a59fd709b0c95 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Wed, 9 Oct 2024 16:47:27 -0500
Subject: [PATCH 40/40] Use cached FLI channel on single-send

---
 smartsim/_core/mli/comm/channel/dragon_fli.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
index 0b462af54..5fb0790a8 100644
--- a/smartsim/_core/mli/comm/channel/dragon_fli.py
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -74,12 +74,14 @@ def send(self, value: bytes, timeout: float = 0.001) -> None:
         :raises SmartSimError: If sending message fails
         """
         try:
-            channel = drg_util.create_local(self._buffer_size)
+            if self._channel is None:
+                self._channel = drg_util.create_local(self._buffer_size)
 
-            with self._fli.sendh(timeout=None, stream_channel=channel) as sendh:
+            with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh:
                 sendh.send_bytes(value, timeout=timeout)
             logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
         except Exception as e:
+            self._channel = None
             raise SmartSimError(
                 f"Error sending via DragonFLIChannel {self.descriptor}"
             ) from e
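
Note on the channel-caching behavior settled across patches 38-40: `DragonFLIChannel` now creates its sender-side stream channel lazily on first use, reuses it for subsequent sends, and discards it whenever a send fails so the next attempt starts from a fresh channel. The sketch below models only that lifecycle and runs without dragon installed; `FakeTransport` and `CachingSender` are illustrative stand-ins, not SmartSim or dragon APIs.

import typing as t


class FakeTransport:
    """Stand-in for an FLI send handle; fails on demand."""

    def __init__(self) -> None:
        self.sent: t.List[bytes] = []

    def send(self, value: bytes, fail: bool = False) -> None:
        if fail:
            raise RuntimeError("simulated send failure")
        self.sent.append(value)


class CachingSender:
    """Mimics DragonFLIChannel's lazy stream-channel caching."""

    def __init__(self, transport: FakeTransport) -> None:
        self._transport = transport
        self._channel: t.Optional[object] = None  # created on first send

    def send(self, value: bytes, fail: bool = False) -> None:
        try:
            if self._channel is None:
                # stands in for drg_util.create_local(self._buffer_size)
                self._channel = object()
            self._transport.send(value, fail=fail)
        except Exception:
            # drop the cached channel so the next send re-creates it
            self._channel = None
            raise


sender = CachingSender(FakeTransport())
sender.send(b"ok")                 # creates and caches the channel
first = sender._channel
sender.send(b"ok again")           # reuses the cached channel
assert sender._channel is first
try:
    sender.send(b"boom", fail=True)
except RuntimeError:
    pass
assert sender._channel is None     # cache discarded after the failure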
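
The reworked `msg_pump` and dispatcher test lean on two cooperating patterns: the producer hands the request header and tensor payload to `send_multiple` as one envelope, so both travel through a single FLI stream and arrive together, while the consumer drains its queue with a timeout-and-retry loop instead of a single blocking get. A rough standard-library approximation of that flow, with all names hypothetical:

import queue
import threading
import time

work_queue: "queue.Queue[list]" = queue.Queue()


def producer(idx: int) -> None:
    header = f"request-{idx}".encode()
    payload = bytes([idx]) * 4
    # one queue item per request keeps header + payload paired,
    # mirroring send_multiple() writing both into one FLI stream
    work_queue.put([header, payload])


threads = [threading.Thread(target=producer, args=(i,)) for i in range(4)]
for th in threads:
    th.start()

received = []
for _ in range(15):  # timeout + retry, like the dispatcher test loop
    try:
        received.append(work_queue.get(timeout=1.0))
    except queue.Empty:
        time.sleep(0.1)
    if len(received) == 4:
        break

for envelope in received:
    assert len(envelope) == 2  # header and payload always arrive together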