From f85db2a584a146f54801dc71107369202c93f1cf Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:54:43 -0500 Subject: [PATCH 01/49] Add file system descriptor to tensor & model keys --- doc/changelog.md | 1 + .../infrastructure/control/workermanager.py | 212 +++++++----------- .../mli/infrastructure/environmentloader.py | 36 ++- .../storage/dragonfeaturestore.py | 7 + .../infrastructure/storage/featurestore.py | 17 ++ .../_core/mli/infrastructure/worker/worker.py | 149 +++++++++--- smartsim/_core/mli/message_handler.py | 11 +- .../mli_schemas/data/data_references.capnp | 2 + .../data/data_references_capnp.pyi | 2 + tests/dragon/{utils => }/featurestore.py | 14 ++ tests/dragon/test_environment_loader.py | 89 +++++--- tests/mli/featurestore.py | 14 ++ .../mli/test_core_machine_learning_worker.py | 83 ++++--- tests/mli/test_torch_worker.py | 4 +- tests/mli/test_worker_manager.py | 13 +- .../test_build_model_key.py | 7 +- .../test_build_tensor_key.py | 6 +- .../test_output_descriptor.py | 3 +- tests/test_message_handler/test_request.py | 12 +- tests/test_message_handler/test_response.py | 5 +- 20 files changed, 432 insertions(+), 255 deletions(-) rename tests/dragon/{utils => }/featurestore.py (92%) diff --git a/doc/changelog.md b/doc/changelog.md index 0ada4e4ec..809ad5e8e 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Enable dynamic feature store selection - Fix dragon package installation bug - Adjust schemas for better performance - Add TorchWorker first implementation and mock inference app example diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 27f5bfc97..79ffc6dbd 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -25,18 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys - -# isort: off -import dragon -from dragon import fli - -# isort: on - import time import typing as t -import numpy as np - from .....error import SmartSimError from .....log import get_logger from ....entrypoints.service import Service @@ -63,96 +54,23 @@ logger = get_logger(__name__) -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... 
client shouldn't have to touch - # callback (or batch size) - - request = MessageHandler.deserialize_request(data_blob) - # return request - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.descriptor - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = None - - output_keys: t.Optional[t.List[str]] = None - - input_meta: t.Optional[t.List[TensorDescriptor]] = None - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "descriptors": - input_meta = request.input.descriptors # type: ignore - - if request.output: - output_keys = [tensor_key.key for tensor_key in request.output] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_keys=input_keys, - input_meta=input_meta, - output_keys=output_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: +def build_failure_reply(status: "StatusEnum", message: str) -> Response: + """Build a response indicating a failure occurred + :param status: The status of the response + :param message: The error message to include in the response""" return MessageHandler.build_response( - status=status, - message=message, - result=[], + status=status, # todo: need to indicate correct status + message=message, # todo: decide what these will be + result=None, custom_attributes=None, ) -def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: - prepared_outputs: t.List[t.Any] = [] - if reply.output_keys: - for key in reply.output_keys: - if not key: - continue - msg_key = MessageHandler.build_tensor_key(key) - prepared_outputs.append(msg_key) - elif reply.outputs: - for _ in reply.outputs: - msg_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", - "float32", - [1], - ) - prepared_outputs.append(msg_tensor_desc) - return prepared_outputs - - -def build_reply(reply: InferenceReply) -> ResponseBuilder: - results = prepare_outputs(reply) +def build_reply(worker: MachineLearningWorkerBase, reply: InferenceReply) -> Response: + """Builds a response for a successful inference request + :param worker: A worker to process the reply with + :param reply: The internal representation of the reply""" + results = worker.prepare_outputs(reply) return MessageHandler.build_response( status=reply.status_enum, @@ -210,10 +128,6 @@ def __init__( self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() """the queue the manager monitors for new tasks""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """a feature store to retrieve models from""" self._worker = worker """The ML Worker implementation""" self._comm_channel_type = comm_channel_type @@ -222,37 +136,68 @@ def __init__( """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._feature_stores = config_loader.get_feature_stores() + """A collection of attached feature stores""" + + def 
_check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available + :param request: The request to validate""" + # collect all feature stores required by the request + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model + fs_inputs + fs_outputs + fs_actual = {key for key in self._feature_stores} + fs_missing = fs_desired - fs_actual + + # exit if all desired feature stores are not available + if fs_missing: + logger.error(f"Missing feature store(s): {fs_missing}") + return False - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. - :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False + return True - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + :param request: The request to validate""" + if request.model_key or request.raw_model: + return True - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False + logger.error("Unable to continue without model bytes or feature store key") + return False - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + :param request: The request to validate""" + if request.input_keys or request.raw_inputs: + return True - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") - return False + logger.error("Unable to continue without input bytes or feature store keys") + return False - if request.callback is None: - logger.error("No callback channel provided in request") - return False + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + :param request: The request to validate""" + if request.callback is not None: + return True - return True + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed. 
+ :param request: The request to validate + :return: True if the request is valid, False otherwise""" + checks = [ + self._check_feature_stores(request), + self._check_model(request), + self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete @@ -279,8 +224,8 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] interm = time.perf_counter() # timing - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device + request = self._worker.deserialize_message( + request_bytes, self._comm_channel_type ) if request.input_meta and tensor_bytes_list: @@ -302,10 +247,12 @@ def _on_iteration(self) -> None: "Could not find model key or model.", ) return - if request.model_key in self._cached_models: + if request.model_key.key in self._cached_models: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - model_result = LoadModelResult(self._cached_models[request.model_key]) + model_result = LoadModelResult( + self._cached_models[request.model_key.key] + ) else: timings.append(time.perf_counter() - interm) # timing @@ -328,7 +275,7 @@ def _on_iteration(self) -> None: fetch_result=fetch_model_result, device=self._device, ) - self._cached_models[request.model_key] = model_result.model + self._cached_models[request.model_key.key] = model_result.model except Exception as e: exception_handler( e, request.callback, "Failed while loading the model." @@ -407,9 +354,7 @@ def _on_iteration(self) -> None: if request.output_keys: try: reply.output_keys = self._worker.place_output( - request, - transformed_output, - self._feature_store, + request, transformed_output, self._feature_stores ) except Exception as e: exception_handler( @@ -425,9 +370,10 @@ def _on_iteration(self) -> None: if reply.outputs is None or not reply.outputs: response = build_failure_reply("fail", "Outputs not found.") else: - reply.status_enum = "complete" - reply.message = "Success" - response = build_reply(reply) + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "no-results") + + response = build_reply(self._worker, reply) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 9f6770623..4502ec2fc 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -33,6 +33,10 @@ from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) class EnvironmentConfigLoader: @@ -47,15 +51,35 @@ def __init__(self) -> None: ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None + self.feature_stores: t.Optional[t.Dict[FeatureStore]] = None self.queue: t.Optional[DragonFLIChannel] = None - def get_feature_store(self) -> t.Optional[FeatureStore]: - """Loads the Feature Store previously set in SSFeatureStore""" - if self._feature_store_descriptor is not None: - self.feature_store = pickle.loads( - base64.b64decode(self._feature_store_descriptor) + def _load_feature_store(self, env_var: str) -> 
FeatureStore: + """Load a feature store from a descriptor + :param descriptor: The descriptor of the feature store + :returns: The hydrated feature store""" + logger.debug(f"Loading feature store from env: {env_var}") + + value = os.getenv(env_var) + if not value: + raise SmartSimError(f"Empty feature store descriptor in environment: {env_var}") + + try: + return pickle.loads(base64.b64decode(value)) + except: + raise SmartSimError( + f"Invalid feature store descriptor in environment: {env_var}" ) - return self.feature_store + + def get_feature_stores(self) -> t.Dict[str, FeatureStore]: + """Loads multiple Feature Stores by scanning environment for variables + prefixed with `SSFeatureStore`""" + prefix = "SSFeatureStore" + if self.feature_stores is None: + env_vars = [var for var in os.environ if var.startswith(prefix)] + stores = [self._load_feature_store(var) for var in env_vars] + self.feature_stores = {fs.descriptor: fs for fs in stores} + return self.feature_stores def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index af592ed0a..d5344198a 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -69,3 +69,10 @@ def __contains__(self, key: str) -> bool: Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return str(self._storage.serialize()) diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 553e13b10..5291b2db3 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -28,6 +28,16 @@ from abc import ABC, abstractmethod +class FeatureStoreKey: + """A key-value pair enabling retrieval of an item in a feature store""" + + def __init__(self, key: str, descriptor: str) -> None: + self.key = key + """The unique key of an item in the feature store""" + self.descriptor = descriptor + """The unique identifier of the feature store containing the key""" + + class FeatureStore(ABC): """Abstract base class providing the common interface for retrieving values from a feature store implementation""" @@ -48,3 +58,10 @@ def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. 
Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" + + @property + @abstractmethod + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index bb8d82231..f7b053245 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -27,10 +27,13 @@ import typing as t from abc import ABC, abstractmethod +import numpy as np + from .....error import SmartSimError from .....log import get_logger from ...comm.channel.channel import CommChannelBase -from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: @@ -44,26 +47,32 @@ class InferenceRequest: def __init__( self, - model_key: t.Optional[str] = None, + model_key: t.Optional[FeatureStoreKey] = None, callback: t.Optional[CommChannelBase] = None, raw_inputs: t.Optional[t.List[bytes]] = None, - # todo: copying byte array is likely to create a copy of the data in - # capnproto and will be a performance issue later - input_keys: t.Optional[t.List[str]] = None, + input_keys: t.Optional[t.List[FeatureStoreKey]] = None, input_meta: t.Optional[t.List[t.Any]] = None, - output_keys: t.Optional[t.List[str]] = None, + output_keys: t.Optional[t.List[FeatureStoreKey]] = None, raw_model: t.Optional[Model] = None, batch_size: int = 0, ): """Initialize the object""" self.model_key = model_key + """A tuple containing a (key, descriptor) pair""" self.raw_model = raw_model + """Raw bytes of an ML model""" self.callback = callback + """The channel used for notification of inference completion""" self.raw_inputs = raw_inputs or [] + """Raw bytes of tensor inputs""" self.input_keys = input_keys or [] + """A list of tuples containing a (key, descriptor) pair""" self.input_meta = input_meta or [] + """Metadata about the input data""" self.output_keys = output_keys or [] + """A list of tuples containing a (key, descriptor) pair""" self.batch_size = batch_size + """The batch size to apply when batching""" class InferenceReply: @@ -148,13 +157,93 @@ def __init__(self, result: bytes) -> None: class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" + @staticmethod + def deserialize_message( + data_blob: bytes, + channel_type: t.Type[CommChannelBase], + ) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize""" + request = MessageHandler.deserialize_request(data_blob) + model_key: t.Optional[FeatureStoreKey] = None + model_bytes: t.Optional[Model] = None + + if request.model.which() == "key": + model_key = FeatureStoreKey( + request.model.key.key, request.model.key.featureStoreDescriptor + ) + elif request.model.which() == "data": + model_bytes = request.model.data + + callback_key = request.replyChannel.reply + comm_channel = channel_type(callback_key) + + input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_bytes: t.Optional[t.List[bytes]] = None + input_meta: t.List[t.Any] = [] + + if request.input.which() == "keys": + input_keys = [ + FeatureStoreKey(input_key.key, 
input_key.featureStoreDescriptor) + for input_key in request.input.keys + ] + elif request.input.which() == "data": + input_bytes = [data.blob for data in request.input.data] + input_meta = [data.tensorDescriptor for data in request.input.data] + + output_keys: t.List[FeatureStoreKey] = [] + if request.output: + output_keys = [ + FeatureStoreKey(output_key.key, output_key.featureStoreDescriptor) + for output_key in request.output + ] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + output_keys=output_keys, + raw_model=model_bytes, + batch_size=0, + ) + return inference_request + + @staticmethod + def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: + prepared_outputs: t.List[t.Any] = [] + if reply.output_keys: + for fs_key in reply.output_keys: + if not fs_key: + continue + + msg_key = MessageHandler.build_tensor_key(fs_key.key, fs_key.descriptor) + prepared_outputs.append(msg_key) + elif reply.outputs: + arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ + output.numpy() for output in reply.outputs + ] + for tensor in arrays: + # todo: need to have the output attributes specified in the req? + # maybe, add `MessageHandler.dtype_of(tensor)`? + # can `build_tensor` do dtype and shape? + msg_tensor = MessageHandler.build_tensor( + tensor, + "c", + "float32", + [1], + ) + prepared_outputs.append(msg_tensor) + return prepared_outputs + @staticmethod def fetch_model( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store :param request: The request that triggered the pipeline - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: Raw bytes of the model""" if request.raw_model: @@ -164,7 +253,7 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model.data) - if not feature_store: + if not feature_stores: raise ValueError("Feature store is required for model retrieval") if not request.model_key: @@ -172,44 +261,47 @@ def fetch_model( "Key must be provided to retrieve model from feature store" ) + key, fsd = request.model_key.key, request.model_key.descriptor + try: - raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) + feature_store = feature_stores[fsd] + raw_bytes: bytes = t.cast(bytes, feature_store[key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {request.model_key}" - ) from ex + raise SmartSimError(f"Model could not be retrieved with key {key}") from ex @staticmethod def fetch_inputs( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] ) -> FetchInputResult: """Given a collection of ResourceKeys, identify the physical location and input metadata :param request: The request that triggered the pipeline - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: the fetched input""" if request.raw_inputs: return FetchInputResult(request.raw_inputs, request.input_meta) - if not feature_store: + if not feature_stores: raise ValueError("No input and no 
feature store provided") if request.input_keys: data: t.List[bytes] = [] - for input_ in request.input_keys: + + for fs_key in request.input_keys: try: - tensor_bytes = t.cast(bytes, feature_store[input_]) + feature_store = feature_stores[fs_key.descriptor] + tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {input_}" + f"Model could not be retrieved with key {fs_key.key}" ) from ex return FetchInputResult( - data, None + data, meta=None ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -231,25 +323,26 @@ def batch_requests( def place_output( request: InferenceRequest, transform_result: TransformOutputResult, - feature_store: t.Optional[FeatureStore], - ) -> t.Collection[t.Optional[str]]: + feature_stores: t.Dict[str, FeatureStore], + ) -> t.Collection[t.Optional[FeatureStoreKey]]: """Given a collection of data, make it available as a shared resource in the feature store :param request: The request that triggered the pipeline :param execute_result: Results from inference - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: A collection of keys that were placed in the feature store""" - if not feature_store: + if not feature_stores: raise ValueError("Feature store is required for output persistence") - keys: t.List[t.Optional[str]] = [] + keys: t.List[t.Optional[FeatureStoreKey]] = [] # need to decide how to get back to original sub-batch inputs so they can be # accurately placed, datum might need to include this. # Consider parallelizing all PUT feature_store operations - for k, v in zip(request.output_keys, transform_result.outputs): - feature_store[k] = v - keys.append(k) + for fs_key, v in zip(request.output_keys, transform_result.outputs): + feature_store = feature_stores[fs_key.descriptor] + feature_store[fs_key.key] = v + keys.append(fs_key) return keys diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 00670dce8..aafc4a4c2 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -92,7 +92,9 @@ def build_output_tensor_descriptor( return description @staticmethod - def build_tensor_key(key: str) -> data_references_capnp.TensorKey: + def build_tensor_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.TensorKey: """ Builds a new TensorKey message with the provided key. @@ -102,6 +104,7 @@ def build_tensor_key(key: str) -> data_references_capnp.TensorKey: try: tensor_key = data_references_capnp.TensorKey.new_message() tensor_key.key = key + tensor_key.featureStoreDescriptor = feature_store_descriptor except Exception as e: raise ValueError("Error building tensor key.") from e return tensor_key @@ -126,7 +129,9 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: return model @staticmethod - def build_model_key(key: str) -> data_references_capnp.ModelKey: + def build_model_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.ModelKey: """ Builds a new ModelKey message with the provided key. 
@@ -136,6 +141,7 @@ def build_model_key(key: str) -> data_references_capnp.ModelKey: try: model_key = data_references_capnp.ModelKey.new_message() model_key.key = key + model_key.featureStoreDescriptor = feature_store_descriptor except Exception as e: raise ValueError("Error building model key.") from e return model_key @@ -498,6 +504,7 @@ def build_response( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], + None ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index f37a95726..699abe5d2 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -28,8 +28,10 @@ struct ModelKey { key @0 :Text; + featureStoreDescriptor @1 :Text; } struct TensorKey { key @0 :Text; + featureStoreDescriptor @1 :Text; } diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index 6f775cf8f..bcf53e0a0 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -36,6 +36,7 @@ from typing import Iterator class ModelKey: key: str + featureStoreDescriptor: str @staticmethod @contextmanager def from_bytes( @@ -71,6 +72,7 @@ class ModelKeyBuilder(ModelKey): class TensorKey: key: str + featureStoreDescriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/tests/dragon/utils/featurestore.py b/tests/dragon/featurestore.py similarity index 92% rename from tests/dragon/utils/featurestore.py rename to tests/dragon/featurestore.py index 93b313431..f9d4a1da2 100644 --- a/tests/dragon/utils/featurestore.py +++ b/tests/dragon/featurestore.py @@ -57,6 +57,13 @@ def __contains__(self, key: str) -> bool: :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "file-system-fs" + class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. 
Stores all @@ -103,6 +110,13 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "in-memory-fs" + class DragonDict: """Mock implementation of a dragon dictionary""" diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 00db0a9d3..d4adb3587 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -27,9 +27,13 @@ import base64 import os import pickle +import typing as t import pytest +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error.errors import SmartSimError + dragon = pytest.importorskip("dragon") import dragon.utils as du @@ -42,7 +46,7 @@ DragonFeatureStore, ) -from .utils.featurestore import MemoryFeatureStore +from .featurestore import FileSystemFeatureStore, MemoryFeatureStore # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -93,59 +97,70 @@ def test_environment_loader_FLI_fails(monkeypatch): @pytest.mark.parametrize( - "expected_keys, expected_values", + "feature_stores", [ - pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), - pytest.param(["another key"], ["another value"]), + pytest.param([], id="No feature stores"), + pytest.param([MemoryFeatureStore()], id="Single feature store"), + pytest.param( + [MemoryFeatureStore(), FileSystemFeatureStore()], + id="Multiple feature stores", + ), ], ) -def test_environment_loader_memory_featurestore( - expected_keys, expected_values, monkeypatch +def test_environment_loader_featurestores( + feature_stores: t.List[FeatureStore], monkeypatch: pytest.MonkeyPatch ): - """MemoryFeatureStores can be correctly serialized and deserialized""" - feature_store = MemoryFeatureStore() - key_value_pairs = zip(expected_keys, expected_values) - for k, v in key_value_pairs: - feature_store[k] = v - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) - config = EnvironmentConfigLoader() - config_feature_store = config.get_feature_store() + """FeatureStore can be correctly identified, serialized and deserialized""" + with monkeypatch.context() as m: + for fs in feature_stores: + value = base64.b64encode(pickle.dumps(fs)).decode("utf-8") + key = f"SSFeatureStore.{fs.descriptor}" + m.setenv(key, value) + + config = EnvironmentConfigLoader() + actual_feature_stores = config.get_feature_stores() + + for fs in feature_stores: + # Confirm that the descriptors were used as keys in the loaded feature stores + assert fs.descriptor in actual_feature_stores - for k, _ in key_value_pairs: - assert config_feature_store[k] == feature_store[k] + # Confirm that the value loaded from env var is a FeatureStore + # and it is consistent w/the key identifying it + loaded_fs = actual_feature_stores[fs.descriptor] + assert loaded_fs.descriptor == fs.descriptor @pytest.mark.parametrize( - "expected_keys, expected_values", + "value_to_use,error_filter", [ - pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), - pytest.param(["another key"], ["another value"]), + pytest.param("", "empty", id="Empty value"), + pytest.param("abcd", "invalid", id="Incorrectly serialized value"), ], ) -def test_environment_loader_dragon_featurestore( - expected_keys, expected_values, monkeypatch +def 
test_environment_loader_featurestores_errors( + value_to_use: str, error_filter: str, monkeypatch: pytest.MonkeyPatch ): - """DragonFeatureStores can be correctly serialized and deserialized""" - storage = DDict() - feature_store = DragonFeatureStore(storage) - key_value_pairs = zip(expected_keys, expected_values) - for k, v in key_value_pairs: - feature_store[k] = v - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) - config = EnvironmentConfigLoader() - config_feature_store = config.get_feature_store() + """Verify that the environment loader reports an error when a feature store + env var is populated with something that cannot be loaded properly""" + + fs = FileSystemFeatureStore() # just use for descriptor... + key = f"SSFeatureStore.{fs.descriptor}" + + with monkeypatch.context() as m, pytest.raises(SmartSimError) as ex: + m.setenv(key, value_to_use) # <----- simulate incorrect value in env var + + config = EnvironmentConfigLoader() + config.get_feature_stores() # <---- kick off validation - for k, _ in key_value_pairs: - assert config_feature_store[k] == feature_store[k] + # confirm the specific key is reported in error message + assert key in ex.value.args[0] + # ensure the failure occurred during loading + assert error_filter in ex.value.args[0].lower() def test_environment_variables_not_set(): """EnvironmentConfigLoader getters return None when environment variables are not set""" config = EnvironmentConfigLoader() - assert config.get_feature_store() == None + assert config.get_feature_stores() == {} assert config.get_queue() == None diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index 93b313431..f9d4a1da2 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -57,6 +57,13 @@ def __contains__(self, key: str) -> bool: :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "file-system-fs" + class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. 
Stores all @@ -103,6 +110,13 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "in-memory-fs" + class DragonDict: """Mock implementation of a dragon dictionary""" diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index cff02c9c1..5e56671c3 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -31,6 +31,7 @@ import torch import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, @@ -90,11 +91,12 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: worker = MachineLearningWorkerCore key = str(persist_torch_model) feature_store = FileSystemFeatureStore() + fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) - fetch_result = worker.fetch_model(request, feature_store) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -104,13 +106,14 @@ def test_fetch_model_disk_missing() -> None: when given an invalid (file system) key""" worker = MachineLearningWorkerCore feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, feature_store) + worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -127,10 +130,11 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: # put model bytes into the feature store feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) - fetch_result = worker.fetch_model(request, feature_store) + request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -142,12 +146,15 @@ def test_fetch_model_feature_store_missing() -> None: bad_key = "some-key" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - request = InferenceRequest(model_key=bad_key) + request = InferenceRequest( + model_key=FeatureStoreKey(bad_key, feature_store.descriptor) + ) # todo: consider that raising this exception shows impl. replace... 
with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, feature_store) + worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert bad_key in ex.value.args[0] @@ -161,11 +168,12 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: key = "test-model" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) - fetch_result = worker.fetch_model(request, feature_store) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -176,13 +184,14 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: when given a valid (file system) key""" tensor_name = str(persist_torch_tensor) - request = InferenceRequest(input_keys=[tensor_name]) + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) worker = MachineLearningWorkerCore - feature_store = MemoryFeatureStore() feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -191,16 +200,17 @@ def test_fetch_input_disk_missing() -> None: when given an invalid (file system) key""" worker = MachineLearningWorkerCore - key = "/path/that/doesnt/exist" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + key = "/path/that/doesnt/exist", fsd - request = InferenceRequest(input_keys=[key]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key, fsd)]) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, feature_store) + worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert key in ex.value.args[0] + assert key[0] in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -211,13 +221,14 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: tensor_name = "test-tensor" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[tensor_name]) + request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs assert list(fetch_result.inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] @@ -230,6 +241,7 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> tensor_name = "test-tensor" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor # put model bytes into the feature store body1 = persist_torch_tensor.read_bytes() @@ -242,10 +254,14 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> feature_store[tensor_name + "3"] = body3 request = InferenceRequest( - input_keys=[tensor_name + "1", tensor_name + "2", tensor_name + "3"] + input_keys=[ + FeatureStoreKey(tensor_name + "1", fsd), + 
FeatureStoreKey(tensor_name + "2", fsd), + FeatureStoreKey(tensor_name + "3", fsd), + ] ) - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) raw_bytes = list(fetch_result.inputs) assert raw_bytes @@ -261,10 +277,11 @@ def test_fetch_input_feature_store_missing() -> None: bad_key = "some-key" feature_store = MemoryFeatureStore() - request = InferenceRequest(input_keys=[bad_key]) + fsd = feature_store.descriptor + request = InferenceRequest(input_keys=[FeatureStoreKey(bad_key, fsd)]) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, feature_store) + worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert bad_key in ex.value.args[0] @@ -276,12 +293,13 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: when given a valid (file system) key""" worker = MachineLearningWorkerCore feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor model_name = "test-model" feature_store[model_name] = persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[model_name]) + request = InferenceRequest(input_keys=[FeatureStoreKey(model_name, fsd)]) - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -304,18 +322,23 @@ def test_place_outputs() -> None: key_name = "test-model" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor # create a key to retrieve from the feature store - keys = [key_name + "1", key_name + "2", key_name + "3"] + keys = [ + FeatureStoreKey(key_name + "1", fsd), + FeatureStoreKey(key_name + "2", fsd), + FeatureStoreKey(key_name + "3", fsd), + ] data = [b"abcdef", b"ghijkl", b"mnopqr"] - for k, v in zip(keys, data): - feature_store[k] = v + for fsk, v in zip(keys, data): + feature_store[fsk.key] = v request = InferenceRequest(output_keys=keys) transform_result = TransformOutputResult(data, [1], "c", "float32") - worker.place_output(request, transform_result, feature_store) + worker.place_output(request, transform_result, {fsd: feature_store}) for i in range(3): - assert feature_store[keys[i]] == data[i] + assert feature_store[keys[i].key] == data[i] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index b73e4a31b..29865ac5b 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -26,12 +26,12 @@ import io -import numpy as np import pytest import torch from torch import nn from torch.nn import functional as F +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -102,7 +102,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key="model", + model_key=FeatureStoreKey("model", ""), callback=None, raw_inputs=tensor_numpy, input_keys=None, diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index df4b0a637..5abc3852f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -44,7 +44,6 @@ from smartsim.log import get_logger from .channel import FileSystemCommChannel -from .featurestore import FileSystemFeatureStore from .worker import IntegratedTorchWorker logger = get_logger(__name__) @@ -139,10 +138,11 @@ def 
mock_messages( tensor = torch.randn((1, 2), dtype=torch.float32) torch.save(tensor, buffer) feature_store[input_key] = buffer.getvalue() + fsd = feature_store.descriptor() - message_tensor_output_key = MessageHandler.build_tensor_key(output_key) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key) - message_model_key = MessageHandler.build_model_key(model_key) + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( reply_channel=callback_channel.descriptor, @@ -183,11 +183,14 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: ) # create a mock client application to populate the request queue + feature_stores = config_loader.get_feature_stores() + fs_list = list(feature_stores.values()) + msg_pump = mp.Process( target=mock_messages, args=( config_loader.get_queue(), - config_loader.get_feature_store(), + fs_list[0], fs_path, comm_path, ), diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index 135e96798..c09c787fc 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -35,10 +35,13 @@ def test_build_model_key_successful(): - model_key = handler.build_model_key("tensor_key") + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key("tensor_key", fsd) assert model_key.key == "tensor_key" + assert model_key.featureStoreDescriptor == fsd def test_build_model_key_unsuccessful(): with pytest.raises(ValueError): - model_key = handler.build_model_key(100) + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key(100, fsd) diff --git a/tests/test_message_handler/test_build_tensor_key.py b/tests/test_message_handler/test_build_tensor_key.py index 7abe9e853..6a28b80c4 100644 --- a/tests/test_message_handler/test_build_tensor_key.py +++ b/tests/test_message_handler/test_build_tensor_key.py @@ -35,10 +35,12 @@ def test_build_tensor_key_successful(): - tensor_key = handler.build_tensor_key("tensor_key") + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key("tensor_key", fsd) assert tensor_key.key == "tensor_key" def test_build_tensor_key_unsuccessful(): with pytest.raises(ValueError): - tensor_key = handler.build_tensor_key(100) + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key(100, fsd) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index fd21eeb0d..beb9a4765 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -33,7 +33,8 @@ handler = MessageHandler() -tensor_key = handler.build_tensor_key("key") +fsd = "mock-feature-store-descriptor" +tensor_key = handler.build_tensor_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 4cfc11584..ea9b04d64 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -31,14 +31,16 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -model_key = MessageHandler.build_model_key("model_key") +fsd = "mock-feature-store-descriptor" + +model_key = 
MessageHandler.build_model_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_tensor_key("input_key1") -input_key2 = MessageHandler.build_tensor_key("input_key2") +input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) +input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) -output_key1 = MessageHandler.build_tensor_key("output_key1") -output_key2 = MessageHandler.build_tensor_key("output_key2") +output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) +output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index 03bd9ba73..d6894eb5c 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -31,9 +31,10 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_tensor_key("result_key1") -result_key2 = MessageHandler.build_tensor_key("result_key2") +result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) +result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From f2528061dffacd97e56df6a24f2461e5295c87c2 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 18:51:06 -0500 Subject: [PATCH 02/49] post-merge tweaks --- ex/high_throughput_inference/mock_app.py | 2 +- .../infrastructure/control/workermanager.py | 51 +++++++++---------- .../mli/infrastructure/environmentloader.py | 18 +++---- .../_core/mli/infrastructure/worker/worker.py | 47 +++++++---------- smartsim/_core/mli/message_handler.py | 3 +- tests/dragon/test_reply_building.py | 31 +---------- tests/mli/test_worker_manager.py | 7 ++- 7 files changed, 56 insertions(+), 103 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e244c93e0..236fab419 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -115,7 +115,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model) + model_arg = MessageHandler.build_model_key(model) # todo: this needs FSD else: model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 79ffc6dbd..e34f711f5 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -34,7 +34,6 @@ from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, @@ -54,32 +53,18 @@ logger = get_logger(__name__) -def build_failure_reply(status: "StatusEnum", message: str) -> 
Response: +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: """Build a response indicating a failure occurred :param status: The status of the response :param message: The error message to include in the response""" return MessageHandler.build_response( - status=status, # todo: need to indicate correct status - message=message, # todo: decide what these will be + status=status, + message=message, result=None, custom_attributes=None, ) -def build_reply(worker: MachineLearningWorkerBase, reply: InferenceReply) -> Response: - """Builds a response for a successful inference request - :param worker: A worker to process the reply with - :param reply: The internal representation of the reply""" - results = worker.prepare_outputs(reply) - - return MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) - - def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -143,13 +128,15 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request are available :param request: The request to validate""" # collect all feature stores required by the request - fs_model = {request.model_key.descriptor} + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} fs_inputs = {key.descriptor for key in request.input_keys} fs_outputs = {key.descriptor for key in request.output_keys} # identify which feature stores are requested and unknown - fs_desired = fs_model + fs_inputs + fs_outputs - fs_actual = {key for key in self._feature_stores} + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual # exit if all desired feature stores are not available @@ -259,7 +246,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing try: fetch_model_result = self._worker.fetch_model( - request, self._feature_store + request, self._feature_stores ) except Exception as e: exception_handler( @@ -287,7 +274,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing try: fetch_model_result = self._worker.fetch_model( - request, self._feature_store + request, self._feature_stores ) except Exception as e: exception_handler( @@ -310,7 +297,9 @@ def _on_iteration(self) -> None: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing try: - fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + fetch_input_result = self._worker.fetch_inputs( + request, self._feature_stores + ) except Exception as e: exception_handler(e, request.callback, "Failed while fetching the inputs.") return @@ -370,10 +359,16 @@ def _on_iteration(self) -> None: if reply.outputs is None or not reply.outputs: response = build_failure_reply("fail", "Outputs not found.") else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") - - response = build_reply(self._worker, reply) + reply.status_enum = "complete" + reply.message = "Success" + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git 
a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 4502ec2fc..9125ac1cd 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -46,13 +46,10 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor: t.Optional[str] = os.getenv( - "SSFeatureStore", None - ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - self.feature_store: t.Optional[FeatureStore] = None - self.feature_stores: t.Optional[t.Dict[FeatureStore]] = None + self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None self.queue: t.Optional[DragonFLIChannel] = None + self._prefix = "SSFeatureStore" def _load_feature_store(self, env_var: str) -> FeatureStore: """Load a feature store from a descriptor @@ -62,10 +59,12 @@ def _load_feature_store(self, env_var: str) -> FeatureStore: value = os.getenv(env_var) if not value: - raise SmartSimError(f"Empty feature store descriptor in environment: {env_var}") + raise SmartSimError( + f"Empty feature store descriptor in environment: {env_var}" + ) try: - return pickle.loads(base64.b64decode(value)) + return t.cast(FeatureStore, pickle.loads(base64.b64decode(value))) except: raise SmartSimError( f"Invalid feature store descriptor in environment: {env_var}" @@ -74,9 +73,8 @@ def _load_feature_store(self, env_var: str) -> FeatureStore: def get_feature_stores(self) -> t.Dict[str, FeatureStore]: """Loads multiple Feature Stores by scanning environment for variables prefixed with `SSFeatureStore`""" - prefix = "SSFeatureStore" - if self.feature_stores is None: - env_vars = [var for var in os.environ if var.startswith(prefix)] + if not self.feature_stores: + env_vars = [var for var in os.environ if var.startswith(self._prefix)] stores = [self._load_feature_store(var) for var in env_vars] self.feature_stores = {fs.descriptor: fs for fs in stores} return self.feature_stores diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f7b053245..74efec976 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -27,8 +27,6 @@ import typing as t from abc import ABC, abstractmethod -import numpy as np - from .....error import SmartSimError from .....log import get_logger from ...comm.channel.channel import CommChannelBase @@ -38,6 +36,7 @@ if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status + from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -81,13 +80,13 @@ class InferenceReply: def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, - output_keys: t.Optional[t.Collection[str]] = None, + output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, status_enum: "Status" = "running", message: str = "In progress", ) -> None: """Initialize the object""" self.outputs: t.Collection[t.Any] = outputs or [] - self.output_keys: t.Collection[t.Optional[str]] = output_keys or [] + self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] self.status_enum = status_enum self.message = message @@ -175,27 +174,25 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - callback_key = request.replyChannel.reply + callback_key = request.replyChannel.descriptor comm_channel = 
channel_type(callback_key) - input_keys: t.Optional[t.List[FeatureStoreKey]] = None input_bytes: t.Optional[t.List[bytes]] = None - input_meta: t.List[t.Any] = [] + output_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(input_key.key, input_key.featureStoreDescriptor) - for input_key in request.input.keys + FeatureStoreKey(value.key, value.featureStoreDescriptor) + for value in request.input.keys ] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore - output_keys: t.List[FeatureStoreKey] = [] if request.output: output_keys = [ - FeatureStoreKey(output_key.key, output_key.featureStoreDescriptor) - for output_key in request.output + FeatureStoreKey(value.key, value.featureStoreDescriptor) + for value in request.output ] inference_request = InferenceRequest( @@ -214,27 +211,19 @@ def deserialize_message( def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: prepared_outputs: t.List[t.Any] = [] if reply.output_keys: - for fs_key in reply.output_keys: - if not fs_key: + for value in reply.output_keys: + if not value: continue - - msg_key = MessageHandler.build_tensor_key(fs_key.key, fs_key.descriptor) + msg_key = MessageHandler.build_tensor_key(value.key, value.descriptor) prepared_outputs.append(msg_key) elif reply.outputs: - arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ - output.numpy() for output in reply.outputs - ] - for tensor in arrays: - # todo: need to have the output attributes specified in the req? - # maybe, add `MessageHandler.dtype_of(tensor)`? - # can `build_tensor` do dtype and shape? 
- msg_tensor = MessageHandler.build_tensor( - tensor, + for _ in reply.outputs: + msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", [1], ) - prepared_outputs.append(msg_tensor) + prepared_outputs.append(msg_tensor_desc) return prepared_outputs @staticmethod diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index aafc4a4c2..bbd74fd28 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -439,6 +439,7 @@ def _assign_result( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], + None, ], ) -> None: """ @@ -504,7 +505,7 @@ def build_response( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], - None + None, ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index d1c4d226b..5f179bbae 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -30,10 +30,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.control.workermanager import ( - build_failure_reply, - build_reply, -) +from smartsim._core.mli.infrastructure.control.workermanager import build_failure_reply from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: @@ -63,29 +60,3 @@ def test_build_failure_reply_fails(): response = build_failure_reply("not a status enum", "message") assert "Error assigning status to response" in ex.value.args[0] - - -@pytest.mark.parametrize( - "status, message", - [ - pytest.param("complete", "Success", id="complete"), - ], -) -def test_build_reply(status: "Status", message: str): - "Ensures replies can be built successfully" - reply = InferenceReply() - reply.status_enum = status - reply.message = message - response = build_reply(reply) - assert response.status == status - assert response.message == message - - -def test_build_reply_fails(): - "Ensures ValueError is raised if a Status Enum is not used" - with pytest.raises(ValueError) as ex: - reply = InferenceReply() - reply.status_enum = "not a status enum" - response = build_reply(reply) - - assert "Error assigning status to response" in ex.value.args[0] diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 5abc3852f..dc4c026c0 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,6 +32,8 @@ import pytest +from tests.mli.featurestore import FileSystemFeatureStore + torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -183,14 +185,11 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: ) # create a mock client application to populate the request queue - feature_stores = config_loader.get_feature_stores() - fs_list = list(feature_stores.values()) - msg_pump = mp.Process( target=mock_messages, args=( config_loader.get_queue(), - fs_list[0], + FileSystemFeatureStore(fs_path), fs_path, comm_path, ), From 09eff200fc397e02fe83c3e62ba5b212856a5712 Mon Sep 17 00:00:00 2001 From: Christopher McBride <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 20:56:54 -0400 Subject: [PATCH 03/49] update upstream tests --- .../mli/infrastructure/environmentloader.py | 4 +- .../infrastructure/storage/featurestore.py | 15 +++--- .../_core/mli/infrastructure/worker/worker.py | 12 +++-- smartsim/_core/mli/message_handler.py | 34 
++++++++---- tests/dragon/test_error_handling.py | 6 +-- .../mli/test_core_machine_learning_worker.py | 54 +++++++++++-------- tests/mli/test_torch_worker.py | 2 +- 7 files changed, 77 insertions(+), 50 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 9125ac1cd..983afc00c 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -49,7 +49,7 @@ def __init__(self) -> None: self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None self.queue: t.Optional[DragonFLIChannel] = None - self._prefix = "SSFeatureStore" + self._feature_store_prefix = "SSFeatureStore" def _load_feature_store(self, env_var: str) -> FeatureStore: """Load a feature store from a descriptor @@ -74,7 +74,7 @@ def get_feature_stores(self) -> t.Dict[str, FeatureStore]: """Loads multiple Feature Stores by scanning environment for variables prefixed with `SSFeatureStore`""" if not self.feature_stores: - env_vars = [var for var in os.environ if var.startswith(self._prefix)] + env_vars = [var for var in os.environ if var.startswith(self._feature_store_prefix)] stores = [self._load_feature_store(var) for var in env_vars] self.feature_stores = {fs.descriptor: fs for fs in stores} return self.feature_stores diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 5291b2db3..49f16af8a 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -27,15 +27,16 @@ import typing as t from abc import ABC, abstractmethod +from pydantic import BaseModel, Field -class FeatureStoreKey: - """A key-value pair enabling retrieval of an item in a feature store""" - def __init__(self, key: str, descriptor: str) -> None: - self.key = key - """The unique key of an item in the feature store""" - self.descriptor = descriptor - """The unique identifier of the feature store containing the key""" +class FeatureStoreKey(BaseModel): + """A key,descriptor pair enabling retrieval of an item from a feature store""" + + key: str = Field(min_length=1) + """The unique key of an item in a feature store""" + descriptor: str = Field(min_length=1) + """The unique identifier of the feature store containing the key""" class FeatureStore(ABC): diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 74efec976..984fd10df 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -162,14 +162,18 @@ def deserialize_message( channel_type: t.Type[CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" + :param data_blob: The byte stream to deserialize + :param channel_type: Type to be used for callback communications + :returns: The raw input message deserialized into an InferenceRequest + """ request = MessageHandler.deserialize_request(data_blob) model_key: t.Optional[FeatureStoreKey] = None model_bytes: t.Optional[Model] = None if request.model.which() == "key": model_key = FeatureStoreKey( - request.model.key.key, request.model.key.featureStoreDescriptor + key=request.model.key.key, + descriptor=request.model.key.featureStoreDescriptor, ) 
elif request.model.which() == "data": model_bytes = request.model.data @@ -183,7 +187,7 @@ def deserialize_message( if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(value.key, value.featureStoreDescriptor) + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": @@ -191,7 +195,7 @@ def deserialize_message( if request.output: output_keys = [ - FeatureStoreKey(value.key, value.featureStoreDescriptor) + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.output ] diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index bbd74fd28..cb5e56df2 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -99,6 +99,8 @@ def build_tensor_key( Builds a new TensorKey message with the provided key. :param key: String to set the TensorKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key :raises ValueError: if building fails """ try: @@ -136,6 +138,8 @@ def build_model_key( Builds a new ModelKey message with the provided key. :param key: String to set the ModelKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key :raises ValueError: if building fails """ try: @@ -222,8 +226,10 @@ def _assign_model( elif class_name == "ModelKey": request.model.key = model # type: ignore else: - raise ValueError("""Invalid custom attribute class name. - Expected 'Model' or 'ModelKey'.""") + raise ValueError( + """Invalid custom attribute class name. + Expected 'Model' or 'ModelKey'.""" + ) except Exception as e: raise ValueError("Error building model portion of request.") from e @@ -267,8 +273,10 @@ def _assign_inputs( elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: - raise ValueError("""Invalid input class name. Expected - 'TensorDescriptor' or 'TensorKey'.""") + raise ValueError( + """Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""" + ) except Exception as e: raise ValueError("Error building inputs portion of request.") from e @@ -337,9 +345,11 @@ def _assign_custom_request_attributes( elif custom_attribute_class_name == "TensorFlowRequestAttributes": request.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError("""Invalid custom attribute class name. + raise ValueError( + """Invalid custom attribute class name. Expected 'TensorFlowRequestAttributes' or - 'TorchRequestAttributes'.""") + 'TorchRequestAttributes'.""" + ) except Exception as e: raise ValueError( "Error building custom attributes portion of request." @@ -459,8 +469,10 @@ def _assign_result( elif result_class_name == "TensorKey": response.result.keys = result # type: ignore else: - raise ValueError("""Invalid custom attribute class name. - Expected 'TensorDescriptor' or 'TensorKey'.""") + raise ValueError( + """Invalid custom attribute class name. + Expected 'TensorDescriptor' or 'TensorKey'.""" + ) except Exception as e: raise ValueError("Error assigning result to response.") from e @@ -492,9 +504,11 @@ def _assign_custom_response_attributes( elif custom_attribute_class_name == "TensorFlowResponseAttributes": response.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError("""Invalid custom attribute class name. + raise ValueError( + """Invalid custom attribute class name. 
Expected 'TensorFlowResponseAttributes' or - 'TorchResponseAttributes'.""") + 'TorchResponseAttributes'.""" + ) except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 151bdd2fc..c929c2341 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -84,7 +84,7 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key") + tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [tensor_key], [], None @@ -116,8 +116,8 @@ def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key") - model_key = MessageHandler.build_model_key("model key") + tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) + model_key = MessageHandler.build_model_key("model key", feature_store.descriptor) request = MessageHandler.build_request( test_dir, model_key, [tensor_key], [tensor_key], [], None ) diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index 5e56671c3..c7e1cb286 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -94,7 +94,7 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) + request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes @@ -110,7 +110,7 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) + request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) with pytest.raises(sse.SmartSimError) as ex: worker.fetch_model(request, {fsd: feature_store}) @@ -133,7 +133,9 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -144,12 +146,12 @@ def test_fetch_model_feature_store_missing() -> None: when given an invalid (feature store) key""" worker = MachineLearningWorkerCore - bad_key = "some-key" + key = "some-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor request = InferenceRequest( - model_key=FeatureStoreKey(bad_key, feature_store.descriptor) + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) ) # todo: consider that raising this exception shows impl. replace... 
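The keyword-only construction used throughout these updated tests follows from the pydantic rewrite of FeatureStoreKey earlier in this patch, where both fields are declared with min_length=1. A minimal sketch of the validation behavior the tests now rely on; the example values are hypothetical and this snippet is illustrative, not part of the diff:

from pydantic import ValidationError

from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey

# keyword construction succeeds when both fields are non-empty strings
fs_key = FeatureStoreKey(key="tensor-1", descriptor="fs-1")

# an empty key (or descriptor) now fails fast at construction time
# instead of producing a silently invalid key object
try:
    FeatureStoreKey(key="", descriptor="fs-1")
except ValidationError as exc:
    print(exc)  # pydantic reports the min_length violation on `key`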
@@ -157,7 +159,7 @@ def test_fetch_model_feature_store_missing() -> None: worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert bad_key in ex.value.args[0] + assert key in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -171,7 +173,9 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes @@ -186,7 +190,9 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) worker = MachineLearningWorkerCore feature_store[tensor_name] = persist_torch_tensor.read_bytes() @@ -202,9 +208,9 @@ def test_fetch_input_disk_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - key = "/path/that/doesnt/exist", fsd + key = "/path/that/doesnt/exist" - request = InferenceRequest(input_keys=[FeatureStoreKey(key, fsd)]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) with pytest.raises(sse.SmartSimError) as ex: worker.fetch_inputs(request, {fsd: feature_store}) @@ -223,7 +229,9 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() @@ -255,9 +263,9 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> request = InferenceRequest( input_keys=[ - FeatureStoreKey(tensor_name + "1", fsd), - FeatureStoreKey(tensor_name + "2", fsd), - FeatureStoreKey(tensor_name + "3", fsd), + FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), ] ) @@ -275,16 +283,16 @@ def test_fetch_input_feature_store_missing() -> None: when given an invalid (feature store) key""" worker = MachineLearningWorkerCore - bad_key = "some-key" + key = "bad-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(bad_key, fsd)]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) with pytest.raises(sse.SmartSimError) as ex: worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert bad_key in ex.value.args[0] + assert key in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -295,9 +303,9 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - model_name = "test-model" - feature_store[model_name] = 
persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[FeatureStoreKey(model_name, fsd)]) + key = "test-model" + feature_store[key] = persist_torch_tensor.read_bytes() + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -326,9 +334,9 @@ def test_place_outputs() -> None: # create a key to retrieve from the feature store keys = [ - FeatureStoreKey(key_name + "1", fsd), - FeatureStoreKey(key_name + "2", fsd), - FeatureStoreKey(key_name + "3", fsd), + FeatureStoreKey(key=key_name + "1", descriptor=fsd), + FeatureStoreKey(key=key_name + "2", descriptor=fsd), + FeatureStoreKey(key=key_name + "3", descriptor=fsd), ] data = [b"abcdef", b"ghijkl", b"mnopqr"] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index 29865ac5b..1e8bba7e3 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -102,7 +102,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey("model", ""), + model_key=FeatureStoreKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensor_numpy, input_keys=None, From 2cedfb336830629c9621dcf837d36c135382c54b Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:35:41 -0500 Subject: [PATCH 04/49] dynamic fs attachment, add backbone to ML worker mgr & config loader --- ex/high_throughput_inference/mli_driver.py | 21 ++- ex/high_throughput_inference/mock_app.py | 40 ++++-- .../standalone_workermanager.py | 24 ++-- smartsim/_core/mli/comm/channel/dragonfli.py | 12 +- .../infrastructure/control/workermanager.py | 50 ++++++-- .../mli/infrastructure/environmentloader.py | 89 +++++++------ .../storage/dragonfeaturestore.py | 38 +++++- .../_core/mli/infrastructure/worker/worker.py | 4 +- smartsim/_core/mli/message_handler.py | 30 ++--- tests/dragon/featurestore.py | 19 ++- tests/dragon/test_environment_loader.py | 120 ++++++++---------- tests/dragon/test_error_handling.py | 82 ++++++++---- tests/dragon/utils/channel.py | 11 ++ tests/dragon/utils/worker.py | 4 +- tests/mli/channel.py | 13 ++ tests/mli/featurestore.py | 21 ++- tests/mli/test_worker_manager.py | 74 ++++++----- tests/mli/worker.py | 4 +- 18 files changed, 412 insertions(+), 244 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 6da559aa6..0cf87ef2e 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,5 +1,4 @@ - - +import argparse import os import base64 import cloudpickle @@ -26,11 +25,23 @@ torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs = exp.create_run_settings( + sys.executable, + [ + worker_manager_script_name, + "--device", + device, + "--worker_class", + torch_worker_str, + ], +) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs = exp.create_run_settings( + sys.executable, + exe_args=[app_script_name, "--device", device], +) app = exp.create_model("app", run_settings=app_rs) 
app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) @@ -47,4 +58,4 @@ break time.sleep(5) -print("Exiting.") \ No newline at end of file +print("Exiting.") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 236fab419..e34b2676a 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -44,16 +44,21 @@ import numbers from collections import OrderedDict +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger("App") + class ProtoClient: def __init__(self, timing_on: bool): connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] self._ddict = DDict.attach(ddict_str) + self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor to_worker_fli_str = None while to_worker_fli_str is None: try: @@ -88,17 +93,23 @@ def start_timings(self, batch_size: int): def end_timings(self): if self._timing_on: self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + self._timings["total_time"].append( + self._format_number(time.perf_counter() - self._start) + ) def measure_time(self, label: str): if self._timing_on: self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._timings[label].append( + self._format_number(time.perf_counter() - self._interm) + ) self._interm = time.perf_counter() def print_timings(self, to_file: bool = False): print(" ".join(self._timings.keys())) - value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.array( + [value for value in self._timings.values()], dtype=float + ) value_array = numpy.transpose(value_array) for i in range(value_array.shape[0]): print(" ".join(self._format_number(value) for value in value_array[i])) @@ -106,21 +117,21 @@ def print_timings(self, to_file: bool = False): numpy.save("timings.npy", value_array) numpy.savetxt("timings.txt", value_array) - def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] self.start_timings(batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape)) + "c", "float32", list(batch.shape) + ) self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model) # todo: this needs FSD + model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) else: model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model= model_arg, + model=model_arg, inputs=[built_tensor_desc], outputs=[], output_descriptors=[], @@ -129,10 +140,12 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + with self._to_worker_fli.sendh( + timeout=None, stream_channel=self._to_worker_ch + ) as to_sendh: to_sendh.send_bytes(request_bytes) for t in tensors: - to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! 
+ to_sendh.send_bytes(t.tobytes()) # TODO NOT FAST ENOUGH!!! # to_sendh.send_bytes(bytes(t.data)) logger.info(f"Message size: {len(request_bytes)} bytes") @@ -159,7 +172,7 @@ def set_model(self, key: str, model: bytes): self._ddict[key] = model -class ResNetWrapper(): +class ResNetWrapper: def __init__(self, name: str, model: str): self._model = torch.jit.load(model) self._name = name @@ -168,7 +181,7 @@ def __init__(self, name: str, model: str): torch.jit.save(scripted, buffer) self._serialized_model = buffer.getvalue() - def get_batch(self, batch_size: int=32): + def get_batch(self, batch_size: int = 32): return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property @@ -179,6 +192,7 @@ def model(self): def name(self): return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -194,8 +208,8 @@ def name(self): for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size==1)): + for iteration_number in range(total_iterations + int(batch_size == 1)): logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.name, resnet.get_batch(batch_size)) - client.print_timings(to_file=True) \ No newline at end of file + client.print_timings(to_file=True) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index c56e11a7c..466d2d669 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -31,17 +31,19 @@ from dragon.data.ddict.ddict import DDict from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure + # isort: on import argparse import base64 import cloudpickle -import pickle +import optparse import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader @@ -75,22 +77,22 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() - - dfs = DragonFeatureStore(ddict) - comm_channel = DragonFLIChannel(to_worker_fli_serialized) + worker_type_name = base64.b64decode(args.worker_class.encode("ascii")) + torch_worker = cloudpickle.loads(worker_type_name)() - os.environ["SSFeatureStore"] = base64.b64encode(pickle.dumps(dfs)).decode("utf-8") os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - config_loader = EnvironmentConfigLoader() + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) worker_manager = WorkerManager( config_loader=config_loader, worker=torch_worker, as_service=True, cooldown=10, - comm_channel_type=DragonCommChannel, - device = args.device, + device=args.device, ) worker_manager.execute() 
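Before the channel changes below: the entrypoint above now publishes only descriptors (the FLI via SSQueue, the backbone dragon dictionary via SS_DRG_DDICT) and hands EnvironmentConfigLoader the factory callables needed to rehydrate them. A minimal sketch of that factory contract, using only names that appear in this patch and assuming a dragon environment where both variables are set; the loader internals are elided:

from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import (
    DragonFeatureStore,
)

# each factory maps a serialized descriptor to a live attachment:
#   featurestore_factory(descriptor: str) -> FeatureStore
#   callback_factory(descriptor: bytes)   -> CommChannelBase
#   queue_factory(descriptor: str)        -> CommChannelBase
config_loader = EnvironmentConfigLoader(
    featurestore_factory=DragonFeatureStore.from_descriptor,
    callback_factory=DragonCommChannel,
    queue_factory=DragonFLIChannel.from_descriptor,
)

backbone = config_loader.get_backbone()  # attaches via SS_DRG_DDICT
task_queue = config_loader.get_queue()   # attaches via SSQueue

This mirrors the design choice visible in the diff: the worker manager no longer takes a comm_channel_type argument and instead pulls the callback factory off the loader, so descriptors, not pickled objects, cross the environment boundary.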
diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 28b4c2bf3..503c17ad3 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -30,7 +30,7 @@ # isort: on -import sys +import base64 import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -70,3 +70,13 @@ def recv(self) -> t.List[bytes]: except fli.FLIEOT as exc: eot = True return messages + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonFLIChannel": + return DragonFLIChannel( + fli_desc=base64.b64decode(descriptor), + sender_supplied=True, + ) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index e34f711f5..b7e409e46 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,11 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys import time import typing as t -from .....error import SmartSimError +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore + from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase @@ -41,14 +41,15 @@ MachineLearningWorkerBase, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import Response, ResponseBuilder +from ...mli_schemas.response.response_capnp import ResponseBuilder if t.TYPE_CHECKING: from dragon.fli import FLInterface - from smartsim._core.mli.mli_schemas.model.model_capnp import Model + # from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import Status - from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor + + # from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -95,9 +96,10 @@ def __init__( self, config_loader: EnvironmentConfigLoader, worker: MachineLearningWorkerBase, + # fs_factory: t.Callable[[str], FeatureStore], as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + # comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager @@ -115,14 +117,18 @@ def __init__( """the queue the manager monitors for new tasks""" self._worker = worker """The ML Worker implementation""" - self._comm_channel_type = comm_channel_type + self._callback_factory = config_loader._callback_factory """The type of communication channel to construct for callbacks""" self._device = device """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" - self._feature_stores = config_loader.get_feature_stores() + self._feature_stores: t.Dict[str, FeatureStore] = {} """A collection of attached feature stores""" + self._fs_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """The backbone feature store""" def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request 
are available @@ -139,11 +145,17 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - # exit if all desired feature stores are not available - if fs_missing: - logger.error(f"Missing feature store(s): {fs_missing}") + if self._fs_factory is None: + logger.warning("No feature store factory configured") return False + # create the feature stores we need to service request + if fs_missing: + logger.info(f"Missing feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._fs_factory(descriptor) + self._feature_stores[descriptor] = feature_store + return True def _check_model(self, request: InferenceRequest) -> bool: @@ -212,7 +224,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing request = self._worker.deserialize_message( - request_bytes, self._comm_channel_type + request_bytes, self._callback_factory ) if request.input_meta and tensor_bytes_list: @@ -234,6 +246,12 @@ def _on_iteration(self) -> None: "Could not find model key or model.", ) return + + # if request.model_key.descriptor not in self._feature_stores: + # self._fs_factory(request.model_key.descriptor) + # todo: decide if we should load here or in _check_feature_stores. + # todo: should i raise error here? + if request.model_key.key in self._cached_models: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing @@ -265,7 +283,9 @@ def _on_iteration(self) -> None: self._cached_models[request.model_key.key] = model_result.model except Exception as e: exception_handler( - e, request.callback, "Failed while loading the model." + e, + request.callback, + "Failed while loading model from feature store.", ) return @@ -290,7 +310,9 @@ def _on_iteration(self) -> None: ) except Exception as e: exception_handler( - e, request.callback, "Failed while loading the model." + e, + request.callback, + "Failed while loading model from feature store.", ) return diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 983afc00c..ec38a56dd 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -24,16 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import base64 import os -import pickle import typing as t -from dragon.fli import FLInterface # pylint: disable=all - -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) @@ -45,45 +40,55 @@ class EnvironmentConfigLoader: into the WorkerManager. 
""" - def __init__(self) -> None: + def __init__( + self, + featurestore_factory: t.Callable[[str], FeatureStore], + callback_factory: t.Callable[[bytes], CommChannelBase], + queue_factory: t.Callable[[str], CommChannelBase], + ) -> None: self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None - self.queue: t.Optional[DragonFLIChannel] = None - self._feature_store_prefix = "SSFeatureStore" - - def _load_feature_store(self, env_var: str) -> FeatureStore: - """Load a feature store from a descriptor - :param descriptor: The descriptor of the feature store - :returns: The hydrated feature store""" - logger.debug(f"Loading feature store from env: {env_var}") + """The descriptor used to attach to the incoming event queue""" + self.queue: t.Optional[CommChannelBase] = None + """The attached incoming event queue channel""" + self._backbone_descriptor: t.Optional[str] = os.getenv("SS_DRG_DDICT", None) + """The descriptor used to attach to the backbone feature store""" + self.backbone: t.Optional[FeatureStore] = None + """The attached backbone feature store""" + self._featurestore_factory = featurestore_factory + """A factory method to instantiate a FeatureStore""" + self._callback_factory = callback_factory + """A factory method to instantiate a concrete CommChannelBase + for inference callbacks""" + self._queue_factory = queue_factory + """A factory method to instantiate a concrete CommChannelBase + for inference requests""" - value = os.getenv(env_var) - if not value: - raise SmartSimError( - f"Empty feature store descriptor in environment: {env_var}" - ) + def get_backbone(self) -> t.Optional[FeatureStore]: + """Create the backbone feature store using the descriptor found in + an environment variable""" + descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) + if self._featurestore_factory is None: + logger.warning("No feature store factory is configured") + return None - try: - return t.cast(FeatureStore, pickle.loads(base64.b64decode(value))) - except: - raise SmartSimError( - f"Invalid feature store descriptor in environment: {env_var}" - ) + if descriptor is not None: + self.backbone = self._featurestore_factory(descriptor) + self._backbone_descriptor = descriptor + return self.backbone - def get_feature_stores(self) -> t.Dict[str, FeatureStore]: - """Loads multiple Feature Stores by scanning environment for variables - prefixed with `SSFeatureStore`""" - if not self.feature_stores: - env_vars = [var for var in os.environ if var.startswith(self._feature_store_prefix)] - stores = [self._load_feature_store(var) for var in env_vars] - self.feature_stores = {fs.descriptor: fs for fs in stores} - return self.feature_stores - - def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: + def get_queue(self) -> t.Optional[CommChannelBase]: """Returns the Queue previously set in SSQueue""" - if self._queue_descriptor is not None: - self.queue = DragonFLIChannel( - fli_desc=base64.b64decode(self._queue_descriptor), - sender_supplied=sender_supplied, - ) + descriptor = self._queue_descriptor or os.getenv("SSQueue", None) + if self._queue_factory is None: + logger.warning("No queue factory is configured") + return None + + if descriptor is not None: + # , sender_supplied: bool = True + # self.queue = DragonFLIChannel( + # fli_desc=base64.b64decode(descriptor), + # sender_supplied=sender_supplied, + # ) + self.queue = self._queue_factory(descriptor) + self._queue_descriptor = 
descriptor return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index d5344198a..213d29cf4 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -26,13 +26,15 @@ import typing as t -import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.log import get_logger +# pylint: disable=import-error +# isort: off +import dragon.data.ddict.ddict as dragon_ddict -if t.TYPE_CHECKING: - from dragon.data.ddict.ddict import DDict +# isort: on +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error import SmartSimError +from smartsim.log import get_logger logger = get_logger(__name__) @@ -40,7 +42,7 @@ class DragonFeatureStore(FeatureStore): """A feature store backed by a dragon distributed dictionary""" - def __init__(self, storage: "DDict") -> None: + def __init__(self, storage: "dragon_ddict.DDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage @@ -54,7 +56,7 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]: raise ex except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError( + raise SmartSimError( f"Could not get value for existing key {key}, error:\n{ex}" ) from ex @@ -76,3 +78,25 @@ def descriptor(self) -> str: the feature store :returns: A descriptor encoded as a string""" return str(self._storage.serialize()) + + @classmethod + def from_descriptor( + cls, + descriptor: str, + # b64encoded: bool = False, + ) -> "DragonFeatureStore": + # import dragon.data.ddict.ddict as dragon_ddict # pylint: disable=import-outside-toplevel + + # # if b64encoded: + # # descriptor = base64.b64decode(descriptor).encode("utf-8") + # # ddict = DDict.attach(descriptor) + # # ddict.attach(descriptor) + + # storage = dragon_ddict.DDict() + # storage.attach(descriptor) + # return DragonFeatureStore(storage) + + if descriptor is None: + print("foo") + return None + return DragonFeatureStore({"tmp": "here"}) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 984fd10df..f1d0775f0 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -159,7 +159,7 @@ class MachineLearningWorkerCore: @staticmethod def deserialize_message( data_blob: bytes, - channel_type: t.Type[CommChannelBase], + callback_factory: t.Callable[[bytes], CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize @@ -179,7 +179,7 @@ def deserialize_message( model_bytes = request.model.data callback_key = request.replyChannel.descriptor - comm_channel = channel_type(callback_key) + comm_channel = callback_factory(callback_key) input_keys: t.Optional[t.List[FeatureStoreKey]] = None input_bytes: t.Optional[t.List[bytes]] = None output_keys: t.Optional[t.List[FeatureStoreKey]] = None diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index cb5e56df2..ee632e24e 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -226,10 +226,8 @@ def _assign_model( elif class_name == "ModelKey": request.model.key = model # type: ignore else: - raise 
ValueError( - """Invalid custom attribute class name. - Expected 'Model' or 'ModelKey'.""" - ) + raise ValueError("""Invalid custom attribute class name. + Expected 'Model' or 'ModelKey'.""") except Exception as e: raise ValueError("Error building model portion of request.") from e @@ -273,10 +271,8 @@ def _assign_inputs( elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: - raise ValueError( - """Invalid input class name. Expected - 'TensorDescriptor' or 'TensorKey'.""" - ) + raise ValueError("""Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error building inputs portion of request.") from e @@ -345,11 +341,9 @@ def _assign_custom_request_attributes( elif custom_attribute_class_name == "TensorFlowRequestAttributes": request.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError( - """Invalid custom attribute class name. + raise ValueError("""Invalid custom attribute class name. Expected 'TensorFlowRequestAttributes' or - 'TorchRequestAttributes'.""" - ) + 'TorchRequestAttributes'.""") except Exception as e: raise ValueError( "Error building custom attributes portion of request." @@ -469,10 +463,8 @@ def _assign_result( elif result_class_name == "TensorKey": response.result.keys = result # type: ignore else: - raise ValueError( - """Invalid custom attribute class name. - Expected 'TensorDescriptor' or 'TensorKey'.""" - ) + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error assigning result to response.") from e @@ -504,11 +496,9 @@ def _assign_custom_response_attributes( elif custom_attribute_class_name == "TensorFlowResponseAttributes": response.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError( - """Invalid custom attribute class name. + raise ValueError("""Invalid custom attribute class name. Expected 'TensorFlowResponseAttributes' or - 'TorchResponseAttributes'.""" - ) + 'TorchResponseAttributes'.""") except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py index f9d4a1da2..a249620fb 100644 --- a/tests/dragon/featurestore.py +++ b/tests/dragon/featurestore.py @@ -115,7 +115,24 @@ def descriptor(self) -> str: """Return a unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" - return "in-memory-fs" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + # b64encoded: bool = False, + ) -> "FileSystemFeatureStore": + # if b64encoded: + # descriptor = base64.b64decode(descriptor).encode("utf-8") + path = pathlib.Path(descriptor) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) class DragonDict: diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index d4adb3587..12893d3b2 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -24,15 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import base64 -import os -import pickle -import typing as t +import pathlib import pytest -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.error.errors import SmartSimError +from tests.mli.channel import FileSystemCommChannel dragon = pytest.importorskip("dragon") @@ -46,7 +42,7 @@ DragonFeatureStore, ) -from .featurestore import FileSystemFeatureStore, MemoryFeatureStore +from .featurestore import FileSystemFeatureStore # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -59,13 +55,17 @@ pytest.param(b"new byte string"), ], ) -def test_environment_loader_attach_FLI(content, monkeypatch): +def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.MonkeyPatch): """A descriptor can be stored, loaded, and reattached""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) config_queue = config.get_queue() new_sender = config_queue.send(content) @@ -75,92 +75,78 @@ def test_environment_loader_attach_FLI(content, monkeypatch): assert result == content -def test_environment_loader_serialize_FLI(monkeypatch): +def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded queue are the same""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) config_queue = config.get_queue() assert config_queue._fli.serialize() == queue.serialize() -def test_environment_loader_FLI_fails(monkeypatch): +def test_environment_loader_FLI_fails(monkeypatch: pytest.MonkeyPatch): """An incorrect serialized descriptor will fails to attach""" monkeypatch.setenv("SSQueue", "randomstring") - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) with pytest.raises(DragonFLIError): config_queue = config.get_queue() -@pytest.mark.parametrize( - "feature_stores", - [ - pytest.param([], id="No feature stores"), - pytest.param([MemoryFeatureStore()], id="Single feature store"), - pytest.param( - [MemoryFeatureStore(), FileSystemFeatureStore()], - id="Multiple feature stores", - ), - ], -) -def test_environment_loader_featurestores( - feature_stores: t.List[FeatureStore], monkeypatch: pytest.MonkeyPatch +def test_environment_loader_backbone_load_fs( + monkeypatch: pytest.MonkeyPatch, test_dir: str ): - """FeatureStore can be correctly identified, serialized and deserialized""" - with monkeypatch.context() as m: - for fs in feature_stores: - value = base64.b64encode(pickle.dumps(fs)).decode("utf-8") - key = f"SSFeatureStore.{fs.descriptor}" - m.setenv(key, value) - - config = EnvironmentConfigLoader() - actual_feature_stores = config.get_feature_stores() + """Verify the file system feature store is loaded correctly 
by + the EnvironmentConfigLoader to demonstrate fs_factory correctness""" + fs = FileSystemFeatureStore(pathlib.Path(test_dir)) + monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) - for fs in feature_stores: - # Confirm that the descriptors were used as keys in the loaded feature stores - assert fs.descriptor in actual_feature_stores + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) - # Confirm that the value loaded from env var is a FeatureStore - # and it is consistent w/the key identifying it - loaded_fs = actual_feature_stores[fs.descriptor] - assert loaded_fs.descriptor == fs.descriptor + backbone = config.get_backbone() + assert backbone is not None -@pytest.mark.parametrize( - "value_to_use,error_filter", - [ - pytest.param("", "empty", id="Empty value"), - pytest.param("abcd", "invalid", id="Incorrectly serialized value"), - ], -) -def test_environment_loader_featurestores_errors( - value_to_use: str, error_filter: str, monkeypatch: pytest.MonkeyPatch +def test_environment_loader_backbone_load_dfs( + monkeypatch: pytest.MonkeyPatch, test_dir: str ): - """Verify that the environment loader reports an error when a feature store - env var is populated with something that cannot be loaded properly""" - - fs = FileSystemFeatureStore() # just use for descriptor... - key = f"SSFeatureStore.{fs.descriptor}" - - with monkeypatch.context() as m, pytest.raises(SmartSimError) as ex: - m.setenv(key, value_to_use) # <----- simulate incorrect value in env var + """Verify the dragon feature store is loaded correctly by + the EnvironmentConfigLoader to demonstrate fs_factory correctness""" + fs = DragonFeatureStore(DDict()) + monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) - config = EnvironmentConfigLoader() - config.get_feature_stores() # <---- kick off validation + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) - # confirm the specific key is reported in error message - assert key in ex.value.args[0] - # ensure the failure occurred during loading - assert error_filter in ex.value.args[0].lower() + backbone = config.get_backbone() + assert backbone is not None def test_environment_variables_not_set(): """EnvironmentConfigLoader getters return None when environment variables are not set""" - config = EnvironmentConfigLoader() - assert config.get_feature_stores() == {} + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) + assert config.get_backbone() == None assert config.get_queue() == None diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index c929c2341..3231b4af2 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,12 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import base64 -import pickle from unittest.mock import MagicMock import pytest +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel + dragon = pytest.importorskip("dragon") import dragon.utils as du @@ -45,6 +45,7 @@ from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, @@ -64,30 +65,51 @@ @pytest.fixture -def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): +def backbone_descriptor() -> str: + # create a shared backbone featurestore + feature_store = DragonFeatureStore(DDict()) + return feature_store.descriptor + + +@pytest.fixture +def app_feature_store() -> FeatureStore: + # create a standalone feature store to mimic a user application putting + # data into an application-owned resource (app should not access backbone) + app_fs = DragonFeatureStore(DDict()) + return app_fs + + +@pytest.fixture +def setup_worker_manager_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): integrated_worker = IntegratedTorchWorker() chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - storage = DDict() - feature_store = DragonFeatureStore(storage) - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) worker_manager = WorkerManager( - EnvironmentConfigLoader(), + EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ), integrated_worker, as_service=False, cooldown=3, - comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( - test_dir, model, [tensor_key], [tensor_key], [], None + test_dir, model, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) worker_manager._task_queue.send(ser_request) @@ -96,30 +118,38 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): @pytest.fixture -def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): +def setup_worker_manager_model_key( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): integrated_worker = IntegratedTorchWorker() chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - storage = DDict() - feature_store = DragonFeatureStore(storage) - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) worker_manager = WorkerManager( - EnvironmentConfigLoader(), + 
EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ), integrated_worker, as_service=False, cooldown=3, - comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) - model_key = MessageHandler.build_model_key("model key", feature_store.descriptor) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") + model_key = MessageHandler.build_model_key( + "model key", app_feature_store.descriptor + ) request = MessageHandler.build_request( - test_dir, model_key, [tensor_key], [tensor_key], [], None + test_dir, model_key, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) worker_manager._task_queue.send(ser_request) @@ -162,7 +192,11 @@ def mock_exception_handler(exc, reply_channel, failure_message): pytest.param( "fetch_model", "Failed while fetching the model.", id="fetch model" ), - pytest.param("load_model", "Failed while loading the model.", id="load model"), + pytest.param( + "load_model", + "Failed while loading model from feature store.", + id="load model", + ), pytest.param( "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" ), diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index df76c484b..4c677eb4c 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -62,3 +62,14 @@ def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" ... + + @classmethod + def from_descriptor( + cls, + descriptor: t.Union[str, bytes], + ) -> "FileSystemCommChannel": + if isinstance(descriptor, str): + path = pathlib.Path(descriptor) + else: + path = pathlib.Path(descriptor.decode("utf-8")) + return FileSystemCommChannel(path) diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py index b1de28018..f6c8120e0 100644 --- a/tests/dragon/utils/worker.py +++ b/tests/dragon/utils/worker.py @@ -47,7 +47,7 @@ class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): @staticmethod def load_model( - request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str ) -> mliw.LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: @@ -61,6 +61,7 @@ def load_model( def transform_input( request: mliw.InferenceRequest, fetch_result: mliw.FetchInputResult, + device: str, ) -> mliw.TransformInputResult: # extra metadata for assembly can be found in request.input_meta raw_inputs = request.raw_inputs or fetch_result.inputs @@ -93,6 +94,7 @@ def execute( def transform_output( request: mliw.InferenceRequest, execute_result: mliw.ExecuteResult, + result_device: str, ) -> mliw.TransformOutputResult: # transformed = [item.clone() for item in execute_result.predictions] # return OutputTransformResult(transformed) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 4bc2014ea..743a21595 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -57,3 +57,16 @@ def send(self, value: bytes) -> None: f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" ) self._file_path.write_bytes(value) + + def recv(self) -> t.List[bytes]: + """Receieve a message through the 
underlying communication channel + :returns: the received message""" + self._file_path.read_bytes() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemCommChannel": + path = pathlib.Path(descriptor) + return FileSystemCommChannel(path) diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index f9d4a1da2..c15a20a34 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -62,7 +62,7 @@ def descriptor(self) -> str: """Return a unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" - return "file-system-fs" + return "in-memory-fs" class FileSystemFeatureStore(FeatureStore): @@ -115,7 +115,24 @@ def descriptor(self) -> str: """Return a unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" - return "in-memory-fs" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + # b64encoded: bool = False, + ) -> "FileSystemFeatureStore": + # if b64encoded: + # descriptor = base64.b64decode(descriptor).encode("utf-8") + path = pathlib.Path(descriptor) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) class DragonDict: diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index dc4c026c0..f48395a76 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,50 +32,38 @@ import pytest -from tests.mli.featurestore import FileSystemFeatureStore +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") +import base64 +import os + +import dragon.channels as dch +from dragon import fli + +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.control.workermanager import ( EnvironmentConfigLoader, WorkerManager, ) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from tests.mli.featurestore import FileSystemFeatureStore from .channel import FileSystemCommChannel -from .worker import IntegratedTorchWorker logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: - """Mock event producer for triggering the inference pipeline""" - # todo: move to unit tests - while True: - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - timestamp = time.time_ns() - output_dir = "/lus/bnchlu1/mcbridch/code/ss/_tmp" - output_path = pathlib.Path(output_dir) - - mock_channel = output_path / f"brainstorm-{timestamp}.txt" - mock_model = output_path / "brainstorm.pt" - - output_path.mkdir(parents=True, exist_ok=True) - mock_channel.touch() - mock_model.touch() - - msg = f"PyTorch:{mock_model}:MockInputToReplace:{mock_channel}" - worker_manager_queue.put(msg.encode("utf-8")) - - def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. @@ -95,7 +83,7 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( - worker_manager_queue: "mp.Queue[bytes]", + worker_manager_queue: CommChannelBase, feature_store: FeatureStore, feature_store_root_dir: pathlib.Path, comm_channel_root_dir: pathlib.Path, @@ -140,7 +128,7 @@ def mock_messages( tensor = torch.randn((1, 2), dtype=torch.float32) torch.save(tensor, buffer) feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor() + fsd = feature_store.descriptor message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) @@ -155,7 +143,7 @@ def mock_messages( custom_attributes=None, ) request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.put(request_bytes) + worker_manager_queue.send(request_bytes) @pytest.fixture @@ -173,22 +161,42 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: fs_path = test_path / "feature_store" comm_path = test_path / "comm_store" - config_loader = EnvironmentConfigLoader() - integrated_worker = IntegratedTorchWorker() + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker = TorchWorker() worker_manager = WorkerManager( config_loader, integrated_worker, as_service=True, - cooldown=10, - comm_channel_type=FileSystemCommChannel, + cooldown=5, + # comm_channel_type=FileSystemCommChannel, + # featurestore_factory=FileSystemFeatureStore.from_descriptor, + device="cpu", ) + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, args=( - config_loader.get_queue(), + worker_queue, FileSystemFeatureStore(fs_path), fs_path, comm_path, diff --git a/tests/mli/worker.py b/tests/mli/worker.py index b1de28018..f6c8120e0 100644 --- a/tests/mli/worker.py +++ b/tests/mli/worker.py @@ -47,7 +47,7 @@ class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): @staticmethod def load_model( - request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + request: 
mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str ) -> mliw.LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: @@ -61,6 +61,7 @@ def load_model( def transform_input( request: mliw.InferenceRequest, fetch_result: mliw.FetchInputResult, + device: str, ) -> mliw.TransformInputResult: # extra metadata for assembly can be found in request.input_meta raw_inputs = request.raw_inputs or fetch_result.inputs @@ -93,6 +94,7 @@ def execute( def transform_output( request: mliw.InferenceRequest, execute_result: mliw.ExecuteResult, + result_device: str, ) -> mliw.TransformOutputResult: # transformed = [item.clone() for item in execute_result.predictions] # return OutputTransformResult(transformed) From 24df7da7574dbe4af8479a4b5bcabb2c92254cbf Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:32:31 -0500 Subject: [PATCH 05/49] add missing from_descriptor methods, --- .../_core/mli/comm/channel/dragonchannel.py | 12 +++++ smartsim/_core/mli/comm/channel/dragonfli.py | 12 +++-- .../mli/infrastructure/environmentloader.py | 7 +-- .../storage/dragonfeaturestore.py | 19 ++------ .../infrastructure/storage/featurestore.py | 4 ++ tests/dragon/featurestore.py | 48 +++++++------------ tests/dragon/utils/channel.py | 25 +++++++--- tests/mli/channel.py | 21 ++++++-- tests/mli/featurestore.py | 48 +++++++------------ 9 files changed, 100 insertions(+), 96 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 672fce75b..c52c9f68c 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import base64 import sys import typing as t @@ -59,3 +60,14 @@ def recv(self) -> t.List[bytes]: with self._channel.recvh(timeout=None) as recvh: message_bytes: bytes = recvh.recv_bytes(timeout=None) return [message_bytes] + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonCommChannel": + try: + return DragonCommChannel(base64.b64decode(descriptor)) + except: + print(f"failed to create dragon comm channel: {descriptor}") + raise diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 503c17ad3..84f49fd52 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -76,7 +76,11 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonFLIChannel": - return DragonFLIChannel( - fli_desc=base64.b64decode(descriptor), - sender_supplied=True, - ) + try: + return DragonFLIChannel( + fli_desc=base64.b64decode(descriptor), + sender_supplied=True, + ) + except: + logger.error(f"Error while creating DragonFLIChannel: {descriptor}") + raise diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index ec38a56dd..3c64fffe9 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -83,12 +83,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]: logger.warning("No queue factory is configured") return None - if descriptor is not None: - # , sender_supplied: bool = True - # self.queue = DragonFLIChannel( - # fli_desc=base64.b64decode(descriptor), - # sender_supplied=sender_supplied, - # ) + if descriptor is not None and descriptor: self.queue = self._queue_factory(descriptor) self._queue_descriptor = descriptor return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 213d29cf4..65ebd57b7 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -85,18 +85,9 @@ def from_descriptor( descriptor: str, # b64encoded: bool = False, ) -> "DragonFeatureStore": - # import dragon.data.ddict.ddict as dragon_ddict # pylint: disable=import-outside-toplevel - # # if b64encoded: - # # descriptor = base64.b64decode(descriptor).encode("utf-8") - # # ddict = DDict.attach(descriptor) - # # ddict.attach(descriptor) - - # storage = dragon_ddict.DDict() - # storage.attach(descriptor) - # return DragonFeatureStore(storage) - - if descriptor is None: - print("foo") - return None - return DragonFeatureStore({"tmp": "here"}) + try: + return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) + except: + print(f"error creating dragon feature store: {descriptor}") + raise diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 49f16af8a..4531f6696 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -29,6 +29,10 @@ from pydantic import BaseModel, Field +from smartsim.log import get_logger + +logger = get_logger(__name__) + class FeatureStoreKey(BaseModel): """A key,descriptor pair enabling retrieval of an item from a feature store""" diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py index a249620fb..352cd8661 100644 --- a/tests/dragon/featurestore.py +++ 
b/tests/dragon/featurestore.py
@@ -29,6 +29,9 @@
 
 import smartsim.error as sse
 from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
 
 
 class MemoryFeatureStore(FeatureStore):
@@ -69,9 +72,13 @@ class FileSystemFeatureStore(FeatureStore):
     """Alternative feature store implementation for testing.
     Stores all data on the file system"""
 
-    def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None:
+    def __init__(
+        self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
+    ) -> None:
         """Initialize the FileSystemFeatureStore instance
         :param storage_dir: (optional) root directory to store all data relative to"""
+        if isinstance(storage_dir, str):
+            storage_dir = pathlib.Path(storage_dir)
         self._storage_dir = storage_dir
 
     def __getitem__(self, key: str) -> bytes:
@@ -127,33 +134,14 @@ def from_descriptor(
     ) -> "FileSystemFeatureStore":
         # if b64encoded:
         #     descriptor = base64.b64decode(descriptor).encode("utf-8")
-        path = pathlib.Path(descriptor)
-        if not path.is_dir():
-            raise ValueError("FileSystemFeatureStore requires a directory path")
-        if not path.exists():
+        try:
+            path = pathlib.Path(descriptor)
             path.mkdir(parents=True, exist_ok=True)
-        return FileSystemFeatureStore(path)
-
-
-class DragonDict:
-    """Mock implementation of a dragon dictionary"""
-
-    def __init__(self) -> None:
-        """Initialize the mock DragonDict instance"""
-        self._storage: t.Dict[bytes, t.Any] = {}
-
-    def __getitem__(self, key: bytes) -> t.Any:
-        """Retrieve an item using key
-        :param key: Unique key of an item to retrieve from the feature store"""
-        return self._storage[key]
-
-    def __setitem__(self, key: bytes, value: t.Any) -> None:
-        """Assign a value using key
-        :param key: Unique key of an item to set in the feature store
-        :param value: Value to persist in the feature store"""
-        self._storage[key] = value
-
-    def __contains__(self, key: bytes) -> bool:
-        """Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
-        return key in self._storage
+            if not path.is_dir():
+                raise ValueError("FileSystemFeatureStore requires a directory path")
+            if not path.exists():
+                path.mkdir(parents=True, exist_ok=True)
+            return FileSystemFeatureStore(path)
+        except:
+            logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}")
+            raise
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index 4c677eb4c..7141eacec 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pathlib
+import threading
 import typing as t
 
 from smartsim._core.mli.comm.channel.channel import CommChannelBase
@@ -38,6 +39,8 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance"""
+        self._lock = threading.RLock()
+
         if not isinstance(key, bytes):
             super().__init__(key.as_posix().encode("utf-8"))
             self._file_path = key
@@ -56,20 +59,28 @@ def send(self, value: bytes) -> None:
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
         )
-        self._file_path.write_bytes(value)
+        with self._lock:
+            self._file_path.write_bytes(value)
 
     def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
         :returns: the received message"""
-        ...
+        with self._lock:
+            incoming = b""  # default so a missing message file cannot trigger UnboundLocalError
+            if self._file_path.exists():
+                incoming = self._file_path.read_bytes()
+                self._file_path.unlink()
+            return incoming
 
     @classmethod
     def from_descriptor(
         cls,
         descriptor: t.Union[str, bytes],
    ) -> "FileSystemCommChannel":
+        try:
+            if isinstance(descriptor, str):
+                path = pathlib.Path(descriptor)
+            else:
+                path = pathlib.Path(descriptor.decode("utf-8"))
+            return FileSystemCommChannel(path)
+        except:
+            print(f"failed to create FS comm channel: {descriptor}")
+            raise
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
index 743a21595..bf155b24b 100644
--- a/tests/mli/channel.py
+++ b/tests/mli/channel.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pathlib
+import threading
 import typing as t
 
 from smartsim._core.mli.comm.channel.channel import CommChannelBase
@@ -38,6 +39,7 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance"""
+        self._lock = threading.RLock()
         if not isinstance(key, bytes):
             super().__init__(key.as_posix().encode("utf-8"))
             self._file_path = key
@@ -56,17 +58,26 @@ def send(self, value: bytes) -> None:
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
         )
-        self._file_path.write_bytes(value)
+        with self._lock:
+            self._file_path.write_bytes(value)
 
-    def recv(self) -> t.List[bytes]:
+    def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
         :returns: the received message"""
-        self._file_path.read_bytes()
+        with self._lock:
+            incoming = b""  # default so a missing message file cannot trigger UnboundLocalError
+            if self._file_path.exists():
+                incoming = self._file_path.read_bytes()
+                self._file_path.unlink()
+            return incoming
 
     @classmethod
     def from_descriptor(
         cls,
         descriptor: str,
     ) -> "FileSystemCommChannel":
-        path = pathlib.Path(descriptor)
-        return FileSystemCommChannel(path)
+        try:
+            path = pathlib.Path(descriptor)
+            return FileSystemCommChannel(path)
+        except:
+            print(f"failed to create fs comm channel: {descriptor}")
+            raise
diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py
index c15a20a34..ecae32203 100644
--- a/tests/mli/featurestore.py
+++ b/tests/mli/featurestore.py
@@ -29,6 +29,9 @@
 
 import smartsim.error as sse
 from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
 
 
 class MemoryFeatureStore(FeatureStore):
@@ -69,9 +72,13 @@ class FileSystemFeatureStore(FeatureStore):
     """Alternative feature store implementation for testing. 
Stores all data on the file system""" - def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None: + def __init__( + self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None + ) -> None: """Initialize the FileSystemFeatureStore instance :param storage_dir: (optional) root directory to store all data relative to""" + if isinstance(storage_dir, str): + storage_dir = pathlib.Path(storage_dir) self._storage_dir = storage_dir def __getitem__(self, key: str) -> bytes: @@ -127,33 +134,14 @@ def from_descriptor( ) -> "FileSystemFeatureStore": # if b64encoded: # descriptor = base64.b64decode(descriptor).encode("utf-8") - path = pathlib.Path(descriptor) - if not path.is_dir(): - raise ValueError("FileSystemFeatureStore requires a directory path") - if not path.exists(): + try: + path = pathlib.Path(descriptor) path.mkdir(parents=True, exist_ok=True) - return FileSystemFeatureStore(path) - - -class DragonDict: - """Mock implementation of a dragon dictionary""" - - def __init__(self) -> None: - """Initialize the mock DragonDict instance""" - self._storage: t.Dict[bytes, t.Any] = {} - - def __getitem__(self, key: bytes) -> t.Any: - """Retrieve an item using key - :param key: Unique key of an item to retrieve from the feature store""" - return self._storage[key] - - def __setitem__(self, key: bytes, value: t.Any) -> None: - """Assign a value using key - :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" - self._storage[key] = value - - def __contains__(self, key: bytes) -> bool: - """Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" - return key in self._storage + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) + except: + logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}") + raise From 82fb67a853bf3dd90723fa27759443d1dc3f3f8a Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:45:29 -0500 Subject: [PATCH 06/49] fix --- .../_core/mli/comm/channel/dragonchannel.py | 2 +- .../storage/dragonfeaturestore.py | 2 +- tests/dragon/test_environment_loader.py | 32 +++++++++---------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index c52c9f68c..d8c0a22ac 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -69,5 +69,5 @@ def from_descriptor( try: return DragonCommChannel(base64.b64decode(descriptor)) except: - print(f"failed to create dragon comm channel: {descriptor}") + logger.error(f"Failed to create dragon comm channel: {descriptor}") raise diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 65ebd57b7..96940886b 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -89,5 +89,5 @@ def from_descriptor( try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) except: - print(f"error creating dragon feature store: {descriptor}") + logger.error(f"Error creating dragon feature store: {descriptor}") raise diff --git 
a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 12893d3b2..72c3ba4f9 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -24,11 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pathlib - import pytest -from tests.mli.channel import FileSystemCommChannel dragon = pytest.importorskip("dragon") @@ -42,7 +39,8 @@ DragonFeatureStore, ) -from .featurestore import FileSystemFeatureStore +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel + # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -63,8 +61,8 @@ def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.Monke config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) config_queue = config.get_queue() @@ -84,8 +82,8 @@ def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch): config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) config_queue = config.get_queue() assert config_queue._fli.serialize() == queue.serialize() @@ -96,8 +94,8 @@ def test_environment_loader_FLI_fails(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("SSQueue", "randomstring") config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) with pytest.raises(DragonFLIError): @@ -109,13 +107,13 @@ def test_environment_loader_backbone_load_fs( ): """Verify the file system feature store is loaded correctly by the EnvironmentConfigLoader to demonstrate fs_factory correctness""" - fs = FileSystemFeatureStore(pathlib.Path(test_dir)) + fs = DragonFeatureStore(DDict()) monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) backbone = config.get_backbone() @@ -132,8 +130,8 @@ def test_environment_loader_backbone_load_dfs( config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) backbone = config.get_backbone() @@ -145,8 +143,8 @@ def test_environment_variables_not_set(): variables are not set""" config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - 
queue_factory=FileSystemCommChannel.from_descriptor,
+        callback_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonCommChannel.from_descriptor,
     )
     assert config.get_backbone() == None
     assert config.get_queue() == None

From 65cf4d1dc7a4547b630b9d4988ec0f0696c40810 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Mon, 29 Jul 2024 11:01:11 -0500
Subject: [PATCH 07/49] fix env loader tests

---
 tests/dragon/test_environment_loader.py | 58 ++++++++-----------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 72c3ba4f9..1b338e1d9 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -26,7 +26,6 @@
 
 import pytest
 
-
 dragon = pytest.importorskip("dragon")
 
 import dragon.utils as du
@@ -34,14 +33,13 @@
 from dragon.data.ddict.ddict import DDict
 from dragon.fli import DragonFLIError, FLInterface
 
+from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
 from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import (
     DragonFeatureStore,
 )
 
-from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
-
-
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
@@ -53,7 +51,7 @@
         pytest.param(b"new byte string"),
     ],
 )
-def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch):
     """A descriptor can be stored, loaded, and reattached"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
@@ -62,18 +60,18 @@ def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.Monke
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_descriptor,
     )
 
     config_queue = config.get_queue()
-    new_sender = config_queue.send(content)
+    _ = config_queue.send(content)
 
     old_recv = queue.recvh()
     result, _ = old_recv.recv_bytes()
     assert result == content
 
 
-def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     """The serialized descriptors of a loaded and unloaded queue
     are the same"""
 
     chan = Channel.make_process_local()
@@ -83,55 +81,35 @@ def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch):
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_descriptor,
    )
     config_queue = config.get_queue()
     assert config_queue._fli.serialize() == queue.serialize()
 
 
-def test_environment_loader_FLI_fails(monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
    """An incorrect serialized descriptor will fail to attach"""
    monkeypatch.setenv("SSQueue", "randomstring")
    config = EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
-        callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonCommChannel.from_descriptor,
+        
callback_factory=None, + queue_factory=DragonFLIChannel.from_descriptor, ) with pytest.raises(DragonFLIError): - config_queue = config.get_queue() + config.get_queue() -def test_environment_loader_backbone_load_fs( - monkeypatch: pytest.MonkeyPatch, test_dir: str -): - """Verify the file system feature store is loaded correctly by - the EnvironmentConfigLoader to demonstrate fs_factory correctness""" - fs = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - - backbone = config.get_backbone() - assert backbone is not None - - -def test_environment_loader_backbone_load_dfs( - monkeypatch: pytest.MonkeyPatch, test_dir: str -): +def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): """Verify the dragon feature store is loaded correctly by the EnvironmentConfigLoader to demonstrate fs_factory correctness""" - fs = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) + feature_store = DragonFeatureStore(DDict()) + monkeypatch.setenv("SS_DRG_DDICT", feature_store.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, + callback_factory=None, + queue_factory=None, ) backbone = config.get_backbone() @@ -146,5 +124,5 @@ def test_environment_variables_not_set(): callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonCommChannel.from_descriptor, ) - assert config.get_backbone() == None - assert config.get_queue() == None + assert config.get_backbone() is None + assert config.get_queue() is None From 15806fe33b3ef0f22e08dc4bea5b24a9d1c95f79 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:10:41 -0500 Subject: [PATCH 08/49] move import below conditional --- tests/mli/test_worker_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index f48395a76..d2fe85d00 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,8 +32,6 @@ import pytest -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel - torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -44,6 +42,7 @@ from dragon import fli from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( EnvironmentConfigLoader, WorkerManager, From cb962be2d05a724c04f953b82ef82e239fe267c1 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:24:04 -0500 Subject: [PATCH 09/49] sort imports for dragon --- tests/dragon/test_error_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 3231b4af2..ecd2c8e41 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -28,7 +28,6 @@ import pytest -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel dragon = pytest.importorskip("dragon") @@ -37,6 +36,7 @@ from dragon.data.ddict.ddict import DDict 
from dragon.fli import FLInterface +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, exception_handler, From 36883c9301ca0ad3d8260134736d5d2a82446b0a Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:44:26 -0500 Subject: [PATCH 10/49] fix feature store type interleaving bug --- tests/dragon/test_error_handling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index ecd2c8e41..e071f80ea 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -106,7 +106,7 @@ def setup_worker_manager_model_bytes( ) tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [output_key], [], None @@ -144,7 +144,7 @@ def setup_worker_manager_model_key( ) tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model_key = MessageHandler.build_model_key( "model key", app_feature_store.descriptor ) From 2e9f146553dbb846487569d9d8a0ae61e489fecd Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:51:17 -0500 Subject: [PATCH 11/49] isort --- tests/dragon/test_error_handling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index e071f80ea..73757014d 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -28,7 +28,6 @@ import pytest - dragon = pytest.importorskip("dragon") import dragon.utils as du From e011b70ff1646748252b0c8b9af0cc01c0b79612 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:30:06 -0500 Subject: [PATCH 12/49] fix test failing new validation check --- tests/mli/test_core_machine_learning_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index c7e1cb286..6fa9f9944 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -85,12 +85,12 @@ def persist_torch_tensor(test_dir: str) -> pathlib.Path: @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: +def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> None: """Verify that the ML worker successfully retrieves a model when given a valid (file system) key""" worker = MachineLearningWorkerCore key = str(persist_torch_model) - feature_store = FileSystemFeatureStore() + feature_store = FileSystemFeatureStore(test_dir) fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() From e6dae22ba8c40f41ff2984ff5aa7c3b761f62d6d Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> 
Date: Wed, 31 Jul 2024 17:50:37 -0500 Subject: [PATCH 13/49] revert gh workflow changes that will be merged later --- .github/workflows/run_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b8e96f05b..8ed348cbd 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -109,7 +109,7 @@ jobs: - name: Install SmartSim (with ML backends) run: | python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis - python -m pip install .[dev,mypy,ml] + python -m pip install .[dev,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset != 'dragon' ) @@ -129,6 +129,7 @@ jobs: - name: Run mypy run: | + python -m pip install .[mypy] make check-mypy - name: Run Pylint From 4548fec2a75207ae1cd9336fababc613661e0dcc Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:55:42 -0500 Subject: [PATCH 14/49] add missing docstrings, remove commented parameters --- smartsim/_core/mli/comm/channel/dragonchannel.py | 3 +++ smartsim/_core/mli/comm/channel/dragonfli.py | 2 ++ .../_core/mli/infrastructure/storage/dragonfeaturestore.py | 5 +++-- tests/dragon/featurestore.py | 6 +++--- tests/dragon/utils/channel.py | 3 +++ tests/mli/channel.py | 3 +++ tests/mli/featurestore.py | 6 +++--- 7 files changed, 20 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index d8c0a22ac..c9eca9046 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -66,6 +66,9 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonCommChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonCommChannel""" try: return DragonCommChannel(base64.b64decode(descriptor)) except: diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 84f49fd52..ff95b2889 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -76,6 +76,8 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonFLIChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource""" try: return DragonFLIChannel( fli_desc=base64.b64decode(descriptor), diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 96940886b..a90c1f901 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -83,9 +83,10 @@ def descriptor(self) -> str: def from_descriptor( cls, descriptor: str, - # b64encoded: bool = False, ) -> "DragonFeatureStore": - + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFeatureStore""" try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) except: diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py index 352cd8661..f8c645f6e 100644 --- 
a/tests/dragon/featurestore.py +++ b/tests/dragon/featurestore.py @@ -130,10 +130,10 @@ def descriptor(self) -> str: def from_descriptor( cls, descriptor: str, - # b64encoded: bool = False, ) -> "FileSystemFeatureStore": - # if b64encoded: - # descriptor = base64.b64decode(descriptor).encode("utf-8") + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" try: path = pathlib.Path(descriptor) path.mkdir(parents=True, exist_ok=True) diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 7141eacec..7efe9b523 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -76,6 +76,9 @@ def from_descriptor( cls, descriptor: t.Union[str, bytes], ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" try: if isinstance(descriptor, str): path = pathlib.Path(descriptor) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index bf155b24b..9ae61a89b 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -75,6 +75,9 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" try: path = pathlib.Path(descriptor) return FileSystemCommChannel(path) diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index ecae32203..5545168b7 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -130,10 +130,10 @@ def descriptor(self) -> str: def from_descriptor( cls, descriptor: str, - # b64encoded: bool = False, ) -> "FileSystemFeatureStore": - # if b64encoded: - # descriptor = base64.b64decode(descriptor).encode("utf-8") + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" try: path = pathlib.Path(descriptor) path.mkdir(parents=True, exist_ok=True) From c29dc6b3d4defdc083bc5509f0a431f8e62f2c41 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:56:49 -0500 Subject: [PATCH 15/49] docstring --- smartsim/_core/mli/comm/channel/dragonfli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index ff95b2889..2cbcb6944 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -77,7 +77,8 @@ def from_descriptor( descriptor: str, ) -> "DragonFLIChannel": """A factory method that creates an instance from a descriptor string - :param descriptor: The descriptor that uniquely identifies the resource""" + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFLIChannel""" try: return DragonFLIChannel( fli_desc=base64.b64decode(descriptor), From 24cbef2b78fb41e378b186bc47d495428769a806 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:57:24 -0500 Subject: [PATCH 16/49] remove commented out imports --- smartsim/_core/mli/infrastructure/control/workermanager.py | 4 ---- 1 
file changed, 4 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index b7e409e46..6f960ced9 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -45,12 +45,8 @@
 
 if t.TYPE_CHECKING:
     from dragon.fli import FLInterface
-
-    # from smartsim._core.mli.mli_schemas.model.model_capnp import Model
     from smartsim._core.mli.mli_schemas.response.response_capnp import Status
 
-    # from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor
-
 logger = get_logger(__name__)
 

From 4eb29b993a9b5bab0ae6061d6df454e7ba5bd863 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 17:59:07 -0500
Subject: [PATCH 17/49] remove commented out code

---
 smartsim/_core/mli/infrastructure/control/workermanager.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index 6f960ced9..2ffb4d97e 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -92,10 +92,8 @@ def __init__(
         self,
         config_loader: EnvironmentConfigLoader,
         worker: MachineLearningWorkerBase,
-        # fs_factory: t.Callable[[str], FeatureStore],
         as_service: bool = False,
         cooldown: int = 0,
-        # comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel,
         device: t.Literal["cpu", "gpu"] = "cpu",
     ) -> None:
         """Initialize the WorkerManager

From eb793b600e3b516324a54909d378b118182b3dbd Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 18:02:36 -0500
Subject: [PATCH 18/49] improve documentation on purpose of backbone fs

---
 smartsim/_core/mli/infrastructure/control/workermanager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index 2ffb4d97e..940f70f98 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -122,7 +122,8 @@ def __init__(
         self._fs_factory = config_loader._featurestore_factory
         """A factory method to create a desired feature store client type"""
         self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone()
-        """The backbone feature store"""
+        """A standalone, system-created feature store used to share internal
+        information among MLI components"""
 
     def _check_feature_stores(self, request: InferenceRequest) -> bool:
         """Ensures that all feature stores required by the request are available

From 318deacb2f8c17c95c897a892550136a2fe044cc Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 18:10:51 -0500
Subject: [PATCH 19/49] improve documentation about backbone usage

---
 smartsim/_core/mli/infrastructure/environmentloader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index 3c64fffe9..81551093f 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -65,7 +65,8 @@ def __init__(
         for inference requests"""
 
     def get_backbone(self) -> t.Optional[FeatureStore]:
-        """Create the backbone feature store using the 
descriptor found in - an environment variable""" + an environment variable. The backbone is a standalone, system-created + feature store used to share internal information among MLI components""" descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) if self._featurestore_factory is None: logger.warning("No feature store factory is configured") From 0eac344f0f4112971043de4ae3ec3240576a39da Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:14:15 -0500 Subject: [PATCH 20/49] remove deprecated & add missing docstring params --- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 940f70f98..57254de93 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -103,7 +103,7 @@ def __init__( :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met - :param comm_channel_type: The type of communication channel used for callbacks + :param device: The type of hardware the workers must be executed on """ super().__init__(as_service, cooldown) From d3b951284323e6347e5612948e186d591f6b09fd Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:15:41 -0500 Subject: [PATCH 21/49] fix renamed param in docstring --- smartsim/_core/mli/infrastructure/worker/worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f1d0775f0..89fb63524 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -163,7 +163,8 @@ def deserialize_message( ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize - :param channel_type: Type to be used for callback communications + :param callback_factory: A factory method that can create an instance + of the desired concrete comm channel type :returns: The raw input message deserialized into an InferenceRequest """ request = MessageHandler.deserialize_request(data_blob) From 6e387e80dbf881c0c28c4cbc0b40df3593017f1a Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:19:26 -0500 Subject: [PATCH 22/49] remove commented lines --- tests/mli/test_worker_manager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index d2fe85d00..026d1f32f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -180,8 +180,6 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: integrated_worker, as_service=True, cooldown=5, - # comm_channel_type=FileSystemCommChannel, - # featurestore_factory=FileSystemFeatureStore.from_descriptor, device="cpu", ) @@ -203,7 +201,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: ) msg_pump.start() - # # create a process to process commands + # create a process to execute commands process = mp.Process(target=worker_manager.execute) process.start() process.join(timeout=5) 
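
Taken together, the preceding patches replace the pickled `SSFeatureStore` payload
with descriptor strings that are resolved through injected `from_descriptor`
factories. The sketch below summarizes that wiring end to end. It is illustrative
only: it assumes a live dragon runtime and reuses the environment variable names
(`SSQueue`, `SS_DRG_DDICT`) and factory signatures shown in the diffs above, but it
is not itself code from the patch series.

    import base64
    import os

    import dragon.channels as dch
    from dragon import fli
    from dragon.data.ddict.ddict import DDict

    from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
    from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
    from smartsim._core.mli.infrastructure.environmentloader import (
        EnvironmentConfigLoader,
    )
    from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import (
        DragonFeatureStore,
    )

    # the launching process publishes descriptors through the environment
    backbone = DragonFeatureStore(DDict())
    os.environ["SS_DRG_DDICT"] = backbone.descriptor

    to_worker_channel = dch.Channel.make_process_local()
    to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
    os.environ["SSQueue"] = base64.b64encode(to_worker_fli.serialize()).decode("utf-8")

    # any MLI component can then re-attach by injecting `from_descriptor` factories
    config = EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
        callback_factory=DragonCommChannel.from_descriptor,
        queue_factory=DragonFLIChannel.from_descriptor,
    )
    assert config.get_backbone() is not None
    assert config.get_queue() is not None
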
From a89f1608fe339e0f9caacb1cabc2e952190032a0 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 18:20:27 -0500
Subject: [PATCH 23/49] remove commented lines

---
 tests/dragon/utils/worker.py | 26 --------------------------
 tests/mli/worker.py          | 26 --------------------------
 2 files changed, 52 deletions(-)

diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py
index f6c8120e0..0582cae56 100644
--- a/tests/dragon/utils/worker.py
+++ b/tests/dragon/utils/worker.py
@@ -96,35 +96,9 @@ def transform_output(
         execute_result: mliw.ExecuteResult,
         result_device: str,
     ) -> mliw.TransformOutputResult:
-        # transformed = [item.clone() for item in execute_result.predictions]
-        # return OutputTransformResult(transformed)
-
-        # transformed = [item.bytes() for item in execute_result.predictions]
-
-        # OutputTransformResult.transformed SHOULD be a list of
-        # capnproto Tensors Or tensor descriptors accompanying bytes
-
         # send the original tensors...
         execute_result.predictions = [t.detach() for t in execute_result.predictions]
         # todo: solve sending all tensor metadata that coincides with each prediction
         return mliw.TransformOutputResult(
             execute_result.predictions, [1], "c", "float32"
         )
-        # return OutputTransformResult(transformed)
-
-    # @staticmethod
-    # def serialize_reply(
-    #     request: InferenceRequest, results: OutputTransformResult
-    # ) -> t.Any:
-    #     # results = IntegratedTorchWorker._prepare_outputs(results.outputs)
-    #     # return results
-    #     return None
-    #     # response = MessageHandler.build_response(
-    #     #     status=200,  # todo: are we satisfied with 0/1 (success, fail)
-    #     #     # todo: if not detailed messages, this shouldn't be returned.
-    #     #     message="success",
-    #     #     result=results,
-    #     #     custom_attributes=None,
-    #     # )
-    #     # serialized_resp = MessageHandler.serialize_response(response)
-    #     # return serialized_resp
diff --git a/tests/mli/worker.py b/tests/mli/worker.py
index f6c8120e0..0582cae56 100644
--- a/tests/mli/worker.py
+++ b/tests/mli/worker.py
@@ -96,35 +96,9 @@ def transform_output(
         execute_result: mliw.ExecuteResult,
         result_device: str,
     ) -> mliw.TransformOutputResult:
-        # transformed = [item.clone() for item in execute_result.predictions]
-        # return OutputTransformResult(transformed)
-
-        # transformed = [item.bytes() for item in execute_result.predictions]
-
-        # OutputTransformResult.transformed SHOULD be a list of
-        # capnproto Tensors Or tensor descriptors accompanying bytes
-
         # send the original tensors...
         execute_result.predictions = [t.detach() for t in execute_result.predictions]
         # todo: solve sending all tensor metadata that coincides with each prediction
         return mliw.TransformOutputResult(
             execute_result.predictions, [1], "c", "float32"
         )
-        # return OutputTransformResult(transformed)
-
-    # @staticmethod
-    # def serialize_reply(
-    #     request: InferenceRequest, results: OutputTransformResult
-    # ) -> t.Any:
-    #     # results = IntegratedTorchWorker._prepare_outputs(results.outputs)
-    #     # return results
-    #     return None
-    #     # response = MessageHandler.build_response(
-    #     #     status=200,  # todo: are we satisfied with 0/1 (success, fail)
-    #     #     # todo: if not detailed messages, this shouldn't be returned.
- # # message="success", - # # result=results, - # # custom_attributes=None, - # # ) - # # serialized_resp = MessageHandler.serialize_response(response) - # # return serialized_resp From 73c7f9b454d4107b5b458a47ac1f38b3a53c4812 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:22:16 -0500 Subject: [PATCH 24/49] formatting --- smartsim/_core/mli/infrastructure/control/workermanager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 57254de93..ee1ba6e83 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -45,6 +45,7 @@ if t.TYPE_CHECKING: from dragon.fli import FLInterface + from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger(__name__) From a5bda09a15489bee68097f327c517eef6a042dcd Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:53:04 -0500 Subject: [PATCH 25/49] revert dupe change from upstream --- .github/workflows/run_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 8ed348cbd..b8e96f05b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -109,7 +109,7 @@ jobs: - name: Install SmartSim (with ML backends) run: | python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis - python -m pip install .[dev,ml] + python -m pip install .[dev,mypy,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset != 'dragon' ) @@ -129,7 +129,6 @@ jobs: - name: Run mypy run: | - python -m pip install .[mypy] make check-mypy - name: Run Pylint From d50a540210e9880c5ccf2e441141195f4353a365 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:27:10 -0500 Subject: [PATCH 26/49] fix confusing docstring --- smartsim/_core/mli/infrastructure/environmentloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 81551093f..265de3d9d 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -64,7 +64,7 @@ def __init__( for inference requests""" def get_backbone(self) -> t.Optional[FeatureStore]: - """Create the backbone feature store using the descriptor found in + """Attach to the backbone feature store using the descriptor found in an environment variable. 
The backbone is a standalone, system-created feature store used to share internal information among MLI components""" descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) From 86b4c2e1454b42bdfd183fd081e2d9a07334bf70 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:59:30 -0500 Subject: [PATCH 27/49] fix incomplete docstrings, tweak logs --- .../infrastructure/control/workermanager.py | 20 +++++++++++++------ .../mli/infrastructure/environmentloader.py | 16 +++++++++++++-- tests/dragon/utils/channel.py | 3 ++- tests/mli/channel.py | 3 ++- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index ee1ba6e83..9928b4cd3 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -128,7 +128,9 @@ def __init__( def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request are available - :param request: The request to validate""" + :param request: The request to validate + :returns: False if feature store validation fails for the request, True otherwise + """ # collect all feature stores required by the request fs_model: t.Set[str] = set() if request.model_key: @@ -147,7 +149,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: # create the feature stores we need to service request if fs_missing: - logger.info(f"Missing feature store(s): {fs_missing}") + logger.debug(f"Adding feature store(s): {fs_missing}") for descriptor in fs_missing: feature_store = self._fs_factory(descriptor) self._feature_stores[descriptor] = feature_store @@ -156,7 +158,9 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: def _check_model(self, request: InferenceRequest) -> bool: """Ensure that a model is available for the request - :param request: The request to validate""" + :param request: The request to validate + :returns: False if model validation fails for the request, True otherwise + """ if request.model_key or request.raw_model: return True @@ -165,7 +169,9 @@ def _check_model(self, request: InferenceRequest) -> bool: def _check_inputs(self, request: InferenceRequest) -> bool: """Ensure that inputs are available for the request - :param request: The request to validate""" + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ if request.input_keys or request.raw_inputs: return True @@ -174,7 +180,9 @@ def _check_inputs(self, request: InferenceRequest) -> bool: def _check_callback(self, request: InferenceRequest) -> bool: """Ensure that a callback channel is available for the request - :param request: The request to validate""" + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ if request.callback is not None: return True @@ -184,7 +192,7 @@ def _check_callback(self, request: InferenceRequest) -> bool: def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
:param request: The request to validate - :return: True if the request is valid, False otherwise""" + :return: False if the request fails any validation checks, True otherwise""" checks = [ self._check_feature_stores(request), self._check_model(request), diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 265de3d9d..f7056f4a0 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -46,6 +46,15 @@ def __init__( callback_factory: t.Callable[[bytes], CommChannelBase], queue_factory: t.Callable[[str], CommChannelBase], ) -> None: + """Initialize the config loader instance with the factories necessary for + creating additional objects. + + :param featurestore_factory: A factory method that produces a feature store + given a descriptor + :param callback_factory: A factory method that produces a callback + channel given a descriptor + :param featurestore_factory: A factory method that produces a queue + channel given a descriptor""" self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) """The descriptor used to attach to the incoming event queue""" self.queue: t.Optional[CommChannelBase] = None @@ -66,7 +75,8 @@ def __init__( def get_backbone(self) -> t.Optional[FeatureStore]: """Attach to the backbone feature store using the descriptor found in an environment variable. The backbone is a standalone, system-created - feature store used to share internal information among MLI components""" + feature store used to share internal information among MLI components + :returns: The attached feature store via SS_DRG_DDICT""" descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) if self._featurestore_factory is None: logger.warning("No feature store factory is configured") @@ -78,7 +88,9 @@ def get_backbone(self) -> t.Optional[FeatureStore]: return self.backbone def get_queue(self) -> t.Optional[CommChannelBase]: - """Returns the Queue previously set in SSQueue""" + """Attach to a queue-like communication channel using the descriptor + found in an environment variable. 
+        :returns: The attached queue specified via SSQueue"""
         descriptor = self._queue_descriptor or os.getenv("SSQueue", None)
         if self._queue_factory is None:
             logger.warning("No queue factory is configured")
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index 7efe9b523..4314b494e 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -38,7 +38,8 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
-        """Initialize the FileSystemCommChannel instance"""
+        """Initialize the FileSystemCommChannel instance
+        :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
 
         if not isinstance(key, bytes):
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
index 9ae61a89b..9e8acd359 100644
--- a/tests/mli/channel.py
+++ b/tests/mli/channel.py
@@ -38,7 +38,8 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
-        """Initialize the FileSystemCommChannel instance"""
+        """Initialize the FileSystemCommChannel instance
+        :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
         if not isinstance(key, bytes):
             super().__init__(key.as_posix().encode("utf-8"))

From 23464837c542fd707924df444804c4a48959f3be Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Thu, 1 Aug 2024 17:56:10 -0500
Subject: [PATCH 28/49] docstring fix

---
 smartsim/_core/mli/comm/channel/dragonfli.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py
index 2cbcb6944..b4b4c2e5b 100644
--- a/smartsim/_core/mli/comm/channel/dragonfli.py
+++ b/smartsim/_core/mli/comm/channel/dragonfli.py
@@ -43,7 +43,10 @@ class DragonFLIChannel(cch.CommChannelBase):
     """Passes messages by writing to a Dragon FLI Channel"""
 
     def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None:
-        """Initialize the DragonFLIChannel instance"""
+        """Initialize the DragonFLIChannel instance
+        :param fli_desc: the descriptor of the FLI channel to attach
+        :param sender_supplied: flag indicating if the FLI uses sender-supplied streams
+        """
         super().__init__(fli_desc)
         # todo: do we need memory pool information to construct the channel correctly?
         self._fli: "fli" = fli.FLInterface.attach(fli_desc)

From d4194659592b02ee2e24b2313b6dff1dab9e6b46 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Thu, 1 Aug 2024 18:11:20 -0500
Subject: [PATCH 29/49] validate & report env config loader attempts to call
 factories

---
 .../mli/infrastructure/environmentloader.py   | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index f7056f4a0..21cac2731 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -36,15 +36,14 @@ class EnvironmentConfigLoader:
     """
-    Facilitates the loading of a FeatureStore and Queue
-    into the WorkerManager.
+    Facilitates the loading of a FeatureStore and Queue into the WorkerManager.
""" def __init__( self, - featurestore_factory: t.Callable[[str], FeatureStore], - callback_factory: t.Callable[[bytes], CommChannelBase], - queue_factory: t.Callable[[str], CommChannelBase], + featurestore_factory: t.Optional[t.Callable[[str], FeatureStore]] = None, + callback_factory: t.Optional[t.Callable[[bytes], CommChannelBase]] = None, + queue_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, ) -> None: """Initialize the config loader instance with the factories necessary for creating additional objects. @@ -53,14 +52,10 @@ def __init__( given a descriptor :param callback_factory: A factory method that produces a callback channel given a descriptor - :param featurestore_factory: A factory method that produces a queue + :param queue_factory: A factory method that produces a queue channel given a descriptor""" - self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - """The descriptor used to attach to the incoming event queue""" self.queue: t.Optional[CommChannelBase] = None """The attached incoming event queue channel""" - self._backbone_descriptor: t.Optional[str] = os.getenv("SS_DRG_DDICT", None) - """The descriptor used to attach to the backbone feature store""" self.backbone: t.Optional[FeatureStore] = None """The attached backbone feature store""" self._featurestore_factory = featurestore_factory @@ -76,27 +71,33 @@ def get_backbone(self) -> t.Optional[FeatureStore]: """Attach to the backbone feature store using the descriptor found in an environment variable. The backbone is a standalone, system-created feature store used to share internal information among MLI components + :returns: The attached feature store via SS_DRG_DDICT""" - descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) + descriptor = os.getenv("SS_DRG_DDICT", "") + + if not descriptor: + logger.warning("No backbone descriptor is configured") + if self._featurestore_factory is None: logger.warning("No feature store factory is configured") return None - if descriptor is not None: - self.backbone = self._featurestore_factory(descriptor) - self._backbone_descriptor = descriptor + self.backbone = self._featurestore_factory(descriptor) return self.backbone def get_queue(self) -> t.Optional[CommChannelBase]: """Attach to a queue-like communication channel using the descriptor found in an environment variable. 
+ :returns: The attached queue specified via SSQueue""" - descriptor = self._queue_descriptor or os.getenv("SSQueue", None) + descriptor = os.getenv("SSQueue", "") + + if not descriptor: + logger.warning("No queue descriptor is configured") + if self._queue_factory is None: logger.warning("No queue factory is configured") return None - if descriptor is not None and descriptor: - self.queue = self._queue_factory(descriptor) - self._queue_descriptor = descriptor + self.queue = self._queue_factory(descriptor) return self.queue From 85a6ee049b7c5dd4f369664690371f32c2680de1 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:44:58 -0500 Subject: [PATCH 30/49] report validation failures in MLI pipeline through callback --- smartsim/_core/mli/infrastructure/control/workermanager.py | 6 +++++- smartsim/_core/mli/infrastructure/environmentloader.py | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 9928b4cd3..eb1273b04 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -235,7 +235,11 @@ def _on_iteration(self) -> None: request.raw_inputs = tensor_bytes_list if not self._validate_request(request): - return + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 21cac2731..c62645f3f 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -41,9 +41,9 @@ class EnvironmentConfigLoader: def __init__( self, - featurestore_factory: t.Optional[t.Callable[[str], FeatureStore]] = None, - callback_factory: t.Optional[t.Callable[[bytes], CommChannelBase]] = None, - queue_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, + featurestore_factory: t.Callable[[str], FeatureStore], + callback_factory: t.Callable[[bytes], CommChannelBase], + queue_factory: t.Callable[[str], CommChannelBase], ) -> None: """Initialize the config loader instance with the factories necessary for creating additional objects. 
From d9a30d78e89764fde6c6e3ffb7e1bf7fd36eb538 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Fri, 2 Aug 2024 14:55:31 -0500
Subject: [PATCH 31/49] fix removal of early return on empty descriptors

---
 smartsim/_core/mli/infrastructure/environmentloader.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index c62645f3f..ea8e6b2ad 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -77,6 +77,7 @@ def get_backbone(self) -> t.Optional[FeatureStore]:
 
         if not descriptor:
             logger.warning("No backbone descriptor is configured")
+            return None
 
         if self._featurestore_factory is None:
             logger.warning("No feature store factory is configured")
@@ -94,6 +95,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]:
 
         if not descriptor:
             logger.warning("No queue descriptor is configured")
+            return None
 
         if self._queue_factory is None:
             logger.warning("No queue factory is configured")

From 5f9c727f9fc75c4cf784322a0a418d922eaa1c4d Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Fri, 2 Aug 2024 15:26:45 -0500
Subject: [PATCH 32/49] format docstrings to render correctly

---
 smartsim/_core/mli/comm/channel/channel.py    |  2 ++
 .../_core/mli/comm/channel/dragonchannel.py   |  2 ++
 smartsim/_core/mli/comm/channel/dragonfli.py  |  4 +++
 .../infrastructure/control/workermanager.py   |  9 +++++-
 .../storage/dragonfeaturestore.py             | 12 +++++---
 .../infrastructure/storage/featurestore.py    | 11 ++++---
 tests/dragon/featurestore.py                  | 29 ++++++++++++-------
 tests/dragon/utils/channel.py                 |  4 +++
 tests/mli/channel.py                          |  4 +++
 tests/mli/featurestore.py                     | 24 ++++++++++-----
 10 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
index a3cce2181..d91859126 100644
--- a/smartsim/_core/mli/comm/channel/channel.py
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -42,11 +42,13 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None:
     @abstractmethod
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
        :param value: The value to send"""
 
    @abstractmethod
    def recv(self) -> t.List[bytes]:
        """Receive a message through the underlying communication channel
+
        :returns: the received message"""
 
    @property
diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py
index c9eca9046..80fdd9cdc 100644
--- a/smartsim/_core/mli/comm/channel/dragonchannel.py
+++ b/smartsim/_core/mli/comm/channel/dragonchannel.py
@@ -56,6 +56,7 @@ def send(self, value: bytes) -> None:
 
     def recv(self) -> t.List[bytes]:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         with self._channel.recvh(timeout=None) as recvh:
             message_bytes: bytes = recvh.recv_bytes(timeout=None)
@@ -67,6 +68,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "DragonCommChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonCommChannel"""
         try:
diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py
index b4b4c2e5b..4636894bd 100644
--- a/smartsim/_core/mli/comm/channel/dragonfli.py
+++ b/smartsim/_core/mli/comm/channel/dragonfli.py
@@ -44,6 +44,7 @@ class DragonFLIChannel(cch.CommChannelBase):
 
     def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None:
         """Initialize the DragonFLIChannel instance
+
         :param fli_desc: the descriptor of the FLI channel to attach
         :param sender_supplied: flag indicating if the FLI uses sender-supplied streams
         """
@@ -56,12 +57,14 @@ def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None:
 
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
         :param value: The value to send"""
         with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh:
             sendh.send_bytes(value)
 
     def recv(self) -> t.List[bytes]:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         messages = []
         eot = False
@@ -80,6 +83,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "DragonFLIChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFLIChannel"""
         try:
diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index eb1273b04..6f52d5364 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -98,6 +98,7 @@ def __init__(
         device: t.Literal["cpu", "gpu"] = "cpu",
     ) -> None:
         """Initialize the WorkerManager
+
         :param config_loader: Environment config loader that loads the task queue
         and feature store
         :param workers: A worker to manage
@@ -128,6 +129,7 @@ def __init__(
 
     def _check_feature_stores(self, request: InferenceRequest) -> bool:
         """Ensures that all feature stores required by the request are available
+
         :param request: The request to validate
         :returns: False if feature store validation fails for the request, True otherwise
         """
@@ -158,6 +160,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool:
 
     def _check_model(self, request: InferenceRequest) -> bool:
         """Ensure that a model is available for the request
+
         :param request: The request to validate
         :returns: False if model validation fails for the request, True otherwise
         """
@@ -169,6 +172,7 @@ def _check_model(self, request: InferenceRequest) -> bool:
 
     def _check_inputs(self, request: InferenceRequest) -> bool:
         """Ensure that inputs are available for the request
+
         :param request: The request to validate
         :returns: False if input validation fails for the request, True otherwise
         """
@@ -180,6 +184,7 @@ def _check_inputs(self, request: InferenceRequest) -> bool:
 
     def _check_callback(self, request: InferenceRequest) -> bool:
         """Ensure that a callback channel is available for the request
+
         :param request: The request to validate
         :returns: False if callback validation fails for the request, True otherwise
         """
@@ -190,7 +195,8 @@ def _check_callback(self, request: InferenceRequest) -> bool:
 
         return False
 
     def _validate_request(self, request: InferenceRequest) -> bool:
-        """Ensure the request can be processed.
+        """Ensure the request can be processed
+
         :param request: The request to validate
         :return: False if the request fails any validation checks, True otherwise"""
         checks = [
@@ -204,6 +210,7 @@ def _validate_request(self, request: InferenceRequest) -> bool:
 
     def _on_iteration(self) -> None:
-        """Executes calls to the machine learning worker implementation to complete
-        the inference pipeline"""
+        """Executes calls to the machine learning worker implementation to
+        complete the inference pipeline"""
         logger.debug("executing worker manager pipeline")
diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py
index a90c1f901..5f42ef0bd 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py
@@ -48,6 +48,7 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None:
 
     def __getitem__(self, key: str) -> t.Union[str, bytes]:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         try:
             value: t.Union[str, bytes] = self._storage[key]
@@ -62,20 +63,22 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]:
 
     def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
         self._storage[key] = value
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         return key in self._storage
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """A unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         return str(self._storage.serialize())
 
@@ -85,6 +88,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "DragonFeatureStore":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFeatureStore"""
         try:
diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py
index 4531f6696..d511d588e 100644
--- a/smartsim/_core/mli/infrastructure/storage/featurestore.py
+++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py
@@ -50,23 +50,26 @@ class FeatureStore(ABC):
     @abstractmethod
     def __getitem__(self, key: str) -> t.Union[str, bytes]:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
 
     @abstractmethod
     def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
 
     @abstractmethod
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
 
     @property
     @abstractmethod
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py
index f8c645f6e..d06035fd7 100644
--- a/tests/dragon/featurestore.py
+++ b/tests/dragon/featurestore.py
@@ -43,6 +43,7 @@ def __init__(self) -> None:
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         if key not in self._storage:
             raise sse.SmartSimError(f"{key} not found in feature store")
@@ -50,20 +51,22 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
-        """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+        """Assign a value using key
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
         self._storage[key] = value
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         return key in self._storage
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         return "file-system-fs"
 
@@ -76,6 +79,7 @@ def __init__(
         self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
     ) -> None:
         """Initialize the FileSystemFeatureStore instance
+
         :param storage_dir: (optional) root directory to store all data relative to"""
         if isinstance(storage_dir, str):
             storage_dir = pathlib.Path(storage_dir)
@@ -83,6 +87,7 @@ def __init__(
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         path = self._key_path(key)
         if not path.exists():
@@ -91,6 +96,7 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
         path = self._key_path(key, create=True)
@@ -98,14 +104,16 @@ def __setitem__(self, key: str, value: bytes) -> None:
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         path = self._key_path(key)
         return path.exists()
 
     def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
         """Given a key, return a path that is optionally combined with a base
         directory used by the FileSystemFeatureStore.
+
         :param key: Unique key of an item to retrieve from the feature store"""
         value = pathlib.Path(key)
 
@@ -119,8 +127,8 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         if not self._storage_dir:
             raise ValueError("No storage path configured")
@@ -132,6 +140,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "FileSystemFeatureStore":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached FileSystemFeatureStore"""
         try:
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index 4314b494e..08b659c07 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -39,6 +39,7 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance
+
         :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
 
@@ -56,6 +57,7 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
 
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
         :param value: The value to send"""
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
@@ -65,6 +67,7 @@ def send(self, value: bytes) -> None:
 
     def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         with self._lock:
             if self._file_path.exists():
@@ -78,6 +81,7 @@ def from_descriptor(
         descriptor: t.Union[str, bytes],
     ) -> "FileSystemCommChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached FileSystemCommChannel"""
         try:
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
index 9e8acd359..226e8683d 100644
--- a/tests/mli/channel.py
+++ b/tests/mli/channel.py
@@ -39,6 +39,7 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance
+
         :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
         if not isinstance(key, bytes):
@@ -55,6 +56,7 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
 
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
         :param value: The value to send"""
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
@@ -64,6 +66,7 @@ def send(self, value: bytes) -> None:
 
     def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         with self._lock:
             if self._file_path.exists():
@@ -77,6 +80,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "FileSystemCommChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached FileSystemCommChannel"""
         try:
diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py
index 5545168b7..de748ae6e 100644
--- a/tests/mli/featurestore.py
+++ b/tests/mli/featurestore.py
@@ -43,6 +43,7 @@ def __init__(self) -> None:
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         if key not in self._storage:
             raise sse.SmartSimError(f"{key} not found in feature store")
@@ -50,8 +51,9 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
-        """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+        """Assign a value using key
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
         self._storage[key] = value
 
     def __contains__(self, key: str) -> bool:
@@ -62,8 +64,8 @@ def __contains__(self, key: str) -> bool:
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         return "in-memory-fs"
 
@@ -76,6 +78,7 @@ def __init__(
         self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
     ) -> None:
         """Initialize the FileSystemFeatureStore instance
+
         :param storage_dir: (optional) root directory to store all data relative to"""
         if isinstance(storage_dir, str):
             storage_dir = pathlib.Path(storage_dir)
@@ -83,6 +86,7 @@ def __init__(
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         path = self._key_path(key)
         if not path.exists():
@@ -91,6 +95,7 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
         path = self._key_path(key, create=True)
@@ -98,14 +103,16 @@ def __setitem__(self, key: str, value: bytes) -> None:
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         path = self._key_path(key)
         return path.exists()
 
     def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
         """Given a key, return a path that is optionally combined with a base
         directory used by the FileSystemFeatureStore.
+ :param key: Unique key of an item to retrieve from the feature store""" value = pathlib.Path(key) @@ -119,8 +126,8 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: @property def descriptor(self) -> str: - """Return a unique identifier enabling a client to connect to - the feature store + """Unique identifier enabling a client to connect to the feature store + :returns: A descriptor encoded as a string""" if not self._storage_dir: raise ValueError("No storage path configured") @@ -132,6 +139,7 @@ def from_descriptor( descriptor: str, ) -> "FileSystemFeatureStore": """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemFeatureStore""" try: From 3fd5ed11b1e674b272857f283a257bc38060d221 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:51:01 -0500 Subject: [PATCH 33/49] rename backbone env var --- ex/high_throughput_inference/mock_app.py | 2 +- ex/high_throughput_inference/standalone_workermanager.py | 2 +- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- smartsim/_core/mli/infrastructure/environmentloader.py | 4 ++-- tests/dragon/test_environment_loader.py | 6 +++--- tests/dragon/test_error_handling.py | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e34b2676a..3a5169a66 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -56,7 +56,7 @@ class ProtoClient: def __init__(self, timing_on: bool): connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + ddict_str = os.environ["SS_INFRA_BACKBONE"] self._ddict = DDict.attach(ddict_str) self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor to_worker_fli_str = None diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 466d2d669..91a425ae4 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -69,7 +69,7 @@ args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + ddict_str = os.environ["SS_INFRA_BACKBONE"] ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 445538f20..16f5c03dc 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -521,7 +521,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "SS_DRG_DDICT": self.infra_ddict, + "SS_INFRA_BACKBONE": self.infra_ddict, }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index ea8e6b2ad..3f52d8d83 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -72,8 +72,8 @@ def get_backbone(self) -> t.Optional[FeatureStore]: an environment variable. 
The backbone is a standalone, system-created feature store used to share internal information among MLI components - :returns: The attached feature store via SS_DRG_DDICT""" - descriptor = os.getenv("SS_DRG_DDICT", "") + :returns: The attached feature store via SS_INFRA_BACKBONE""" + descriptor = os.getenv("SS_INFRA_BACKBONE", "") if not descriptor: logger.warning("No backbone descriptor is configured") diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 1b338e1d9..77b400a95 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -101,10 +101,10 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): - """Verify the dragon feature store is loaded correctly by - the EnvironmentConfigLoader to demonstrate fs_factory correctness""" + """Verify the dragon feature store is loaded correctly by the + EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" feature_store = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_DRG_DDICT", feature_store.descriptor) + monkeypatch.setenv("SS_INFRA_BACKBONE", feature_store.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 73757014d..e576452b7 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -91,7 +91,7 @@ def setup_worker_manager_model_bytes( queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( EnvironmentConfigLoader( @@ -129,7 +129,7 @@ def setup_worker_manager_model_key( queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( EnvironmentConfigLoader( From 6d4f2e0a84e6b9b7d71492ff9c074ae4186109d2 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:58:09 -0500 Subject: [PATCH 34/49] debug descriptor failure on build agent --- tests/dragon/test_environment_loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 77b400a95..46a4a5cb4 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -112,6 +112,8 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): queue_factory=None, ) + print(f"calling config.get_backbone: `{feature_store.descriptor}`") + backbone = config.get_backbone() assert backbone is not None From c75dc5a2d3a04758879e2001b241952dcf5203cb Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:49:27 -0500 Subject: [PATCH 35/49] download and log original asset name on `smart build --dragon` --- smartsim/_core/_cli/scripts/dragon_install.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git 
a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 03a128ab8..2060c73c7 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -5,6 +5,7 @@ from github import Github from github.GitReleaseAsset import GitReleaseAsset +from urllib.request import urlretrieve from smartsim._core._cli.utils import pip from smartsim._core._install.builder import WebTGZ @@ -163,10 +164,22 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib if download_dir.exists() and list(download_dir.rglob("*.whl")): return download_dir - archive = WebTGZ(asset.browser_download_url) + download_dir.mkdir(parents=True, exist_ok=True) + + # grab a copy of the complete asset + asset_path = download_dir / str(asset.name) + download_url = asset.browser_download_url + try: + urlretrieve(download_url, str(asset_path)) + logger.debug(f"Retrieved asset {asset.name} to {download_url}") + except Exception: + logger.warning(f"Unable to download asset from: {download_url}") + + # extract the asset + archive = WebTGZ(download_url) archive.extract(download_dir) - logger.debug(f"Retrieved {asset.browser_download_url} to {download_dir}") + logger.debug(f"Extracted {download_url} to {download_dir}") return download_dir From ee07d94934d39a7a1d1816ed7678e6a05449bb8d Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 18:33:03 -0500 Subject: [PATCH 36/49] test --- smartsim/_core/_cli/scripts/dragon_install.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 2060c73c7..d31df4753 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -2,10 +2,10 @@ import pathlib import sys import typing as t +from urllib.request import urlretrieve from github import Github from github.GitReleaseAsset import GitReleaseAsset -from urllib.request import urlretrieve from smartsim._core._cli.utils import pip from smartsim._core._install.builder import WebTGZ @@ -169,11 +169,25 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # grab a copy of the complete asset asset_path = download_dir / str(asset.name) download_url = asset.browser_download_url + if "0.91" not in asset.name: + if "3.9" in python_version(): + logger.debug("I want to snake the original w/3.9 rpm") + # download_url = "https://arti.hpc.amslabs.hpecorp.net/ui/native/dragon-rpm-master-local/dev/master/sle15_sp3_pe/x86_64/dragon-0.91-py3.11.5-1d600977c.rpm" + ... 
# temp no-op + elif "3.10" in python_version(): + logger.debug("snaking original w/3.10 rpm") + download_url = "https://drive.usercontent.google.com/download?id=1dyScGNomzoPO8-bC8i6zaIbOOhsL83Sp&export=download&authuser=0&confirm=t&uuid=6068afeb-14fd-4303-90a5-498b316d3cce&at=APZUnTWTIf9Tl7Yt8tcdKyodnydV:1722641072921" + elif "3.11" in python_version(): + logger.debug("snaking original w/3.11rpm") + download_url = "https://drive.usercontent.google.com/download?id=1vhUXLIu06-RPA_N3wWmi42avnawzizZZ&export=download&authuser=0&confirm=t&uuid=04c920cb-2e66-4762-8e0f-8ad57e0cbbdf&at=APZUnTUKtCv_BgYOkWAaHqoPpGLd:1722640947383" + else: + logger.debug(f"the name was: {asset.name}") + try: urlretrieve(download_url, str(asset_path)) logger.debug(f"Retrieved asset {asset.name} to {download_url}") except Exception: - logger.warning(f"Unable to download asset from: {download_url}") + logger.exception(f"Unable to download asset from: {download_url}") # extract the asset archive = WebTGZ(download_url) From a2691865afed1346cf9bedad32bd027d3dde124d Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:12:01 -0500 Subject: [PATCH 37/49] test --- smartsim/_core/_cli/scripts/dragon_install.py | 8 +- tests/mli/test_worker_manager.py | 2 +- tests/test_dragon_installer.py | 108 +++++++++--------- 3 files changed, 60 insertions(+), 58 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index d31df4753..48eb750e7 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -1,5 +1,6 @@ import os import pathlib +import shutil import sys import typing as t from urllib.request import urlretrieve @@ -161,8 +162,9 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # if we've previously downloaded the release and still have # wheels laying around, use that cached version instead - if download_dir.exists() and list(download_dir.rglob("*.whl")): - return download_dir + if download_dir.exists() or list(download_dir.rglob("*.whl")): + # return download_dir + shutil.rmtree(str(download_dir)) download_dir.mkdir(parents=True, exist_ok=True) @@ -185,7 +187,7 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib try: urlretrieve(download_url, str(asset_path)) - logger.debug(f"Retrieved asset {asset.name} to {download_url}") + logger.debug(f"Retrieved asset {asset.name} from {download_url}") except Exception: logger.exception(f"Unable to download asset from: {download_url}") diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 026d1f32f..8dcff39f4 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -54,9 +54,9 @@ from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from tests.mli.featurestore import FileSystemFeatureStore from .channel import FileSystemCommChannel +from .featurestore import FileSystemFeatureStore logger = get_logger(__name__) # The tests in this file belong to the dragon group diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 4bf589ad4..ea4d3eb55 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -156,60 +156,60 @@ def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: assert not test_archive.exists() -def 
test_retrieve_cached( - test_dir: str, - # archive_path: pathlib.Path, - test_archive: pathlib.Path, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Verify that a previously retrieved asset archive is re-used and the - release asset retrieval is not attempted""" - - asset_id = 123 - - def mock_webtgz_extract(self_, target_) -> None: - mock_extraction_dir = pathlib.Path(target_) - with tarfile.TarFile.open(test_archive) as tar: - tar.extractall(mock_extraction_dir) - - # we'll use the mock extract to create the files that would normally be downloaded - expected_output_dir = test_archive.parent / str(asset_id) - mock_webtgz_extract(None, expected_output_dir) - - # get modification time of directory holding the "downloaded" archive - ts1 = expected_output_dir.stat().st_ctime - - requester = Requester( - auth=None, - base_url="https://github.com", - user_agent="mozilla", - per_page=10, - verify=False, - timeout=1, - retry=1, - pool_size=1, - ) - headers = {"mock-header": "mock-value"} - attributes = {"mock-attr": "mock-attr-value"} - completed = True - - asset = GitReleaseAsset(requester, headers, attributes, completed) - - # ensure mocked asset has values that we use... - monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) - monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) - monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) - - # show that retrieving an asset w/a different ID results in ignoring - # other wheels from prior downloads in the parent directory of the asset - asset_path = retrieve_asset(test_archive.parent, asset) - ts2 = asset_path.stat().st_ctime - - # NOTE: the file should be written to a subdir based on the asset ID - assert ( - asset_path == expected_output_dir - ) # shows that the expected path matches the output path - assert ts1 == ts2 # show that the file wasn't changed... +# def test_retrieve_cached( +# test_dir: str, +# # archive_path: pathlib.Path, +# test_archive: pathlib.Path, +# monkeypatch: pytest.MonkeyPatch, +# ) -> None: +# """Verify that a previously retrieved asset archive is re-used and the +# release asset retrieval is not attempted""" + +# asset_id = 123 + +# def mock_webtgz_extract(self_, target_) -> None: +# mock_extraction_dir = pathlib.Path(target_) +# with tarfile.TarFile.open(test_archive) as tar: +# tar.extractall(mock_extraction_dir) + +# # we'll use the mock extract to create the files that would normally be downloaded +# expected_output_dir = test_archive.parent / str(asset_id) +# mock_webtgz_extract(None, expected_output_dir) + +# # get modification time of directory holding the "downloaded" archive +# ts1 = expected_output_dir.stat().st_ctime + +# requester = Requester( +# auth=None, +# base_url="https://github.com", +# user_agent="mozilla", +# per_page=10, +# verify=False, +# timeout=1, +# retry=1, +# pool_size=1, +# ) +# headers = {"mock-header": "mock-value"} +# attributes = {"mock-attr": "mock-attr-value"} +# completed = True + +# asset = GitReleaseAsset(requester, headers, attributes, completed) + +# # ensure mocked asset has values that we use... 
+# monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) +# monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) +# monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + +# # show that retrieving an asset w/a different ID results in ignoring +# # other wheels from prior downloads in the parent directory of the asset +# asset_path = retrieve_asset(test_archive.parent, asset) +# ts2 = asset_path.stat().st_ctime + +# # NOTE: the file should be written to a subdir based on the asset ID +# assert ( +# asset_path == expected_output_dir +# ) # shows that the expected path matches the output path +# assert ts1 == ts2 # show that the file wasn't changed... def test_retrieve_updated( From e75a18fa318bfc20a08bba4183ae146189b5bb61 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:46:11 -0500 Subject: [PATCH 38/49] remove test_worker_Manager --- tests/mli/test_worker_manager.py | 418 +++++++++++++++---------------- 1 file changed, 209 insertions(+), 209 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 8dcff39f4..ae764591f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -1,209 +1,209 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import io -import logging -import multiprocessing as mp -import pathlib -import time - -import pytest - -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") - -import base64 -import os - -import dragon.channels as dch -from dragon import fli - -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.workermanager import ( - EnvironmentConfigLoader, - WorkerManager, -) -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( - DragonFeatureStore, -) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler -from smartsim.log import get_logger - -from .channel import FileSystemCommChannel -from .featurestore import FileSystemFeatureStore - -logger = get_logger(__name__) -# The tests in this file belong to the dragon group -pytestmark = pytest.mark.dragon - - -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. - - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path - - -def mock_messages( - worker_manager_queue: CommChannelBase, - feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, -) -> None: - """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes - - iteration_number = 0 - - while True: - iteration_number += 1 - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded - - channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" - callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) - - buffer = io.BytesIO() - tensor = torch.randn((1, 2), dtype=torch.float32) - torch.save(tensor, buffer) - feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, - model=message_model_key, - inputs=[message_tensor_input_key], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.send(request_bytes) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def test_worker_manager(prepare_environment: pathlib.Path) -> None: - """Test the worker manager""" - - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader - # or test environment may be unable to send messages w/queue - os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - - config_loader = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - integrated_worker = TorchWorker() - - worker_manager = WorkerManager( - config_loader, - integrated_worker, - as_service=True, - cooldown=5, - device="cpu", - ) - - worker_queue = config_loader.get_queue() - if worker_queue is None: - logger.warn( - f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" - ) - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - FileSystemFeatureStore(fs_path), - fs_path, - comm_path, - ), - ) - msg_pump.start() - - # create a process to execute commands - process = mp.Process(target=worker_manager.execute) - process.start() - process.join(timeout=5) - process.kill() - msg_pump.kill() +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # 
All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# import io +# import logging +# import multiprocessing as mp +# import pathlib +# import time + +# import pytest + +# torch = pytest.importorskip("torch") +# dragon = pytest.importorskip("dragon") + +# import base64 +# import os + +# import dragon.channels as dch +# from dragon import fli + +# from smartsim._core.mli.comm.channel.channel import CommChannelBase +# from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +# from smartsim._core.mli.infrastructure.control.workermanager import ( +# EnvironmentConfigLoader, +# WorkerManager, +# ) +# from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +# DragonFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +# from smartsim._core.mli.message_handler import MessageHandler +# from smartsim.log import get_logger + +# from .channel import FileSystemCommChannel +# from .featurestore import FileSystemFeatureStore + +# logger = get_logger(__name__) +# # The tests in this file belong to the dragon group +# pytestmark = pytest.mark.dragon + + +# def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: +# """Create a simple torch model and persist to disk for +# testing purposes. 
+ +# TODO: remove once unit tests are in place""" +# # test_path = pathlib.Path(work_dir) +# if not model_path.parent.exists(): +# model_path.parent.mkdir(parents=True, exist_ok=True) + +# model_path.unlink(missing_ok=True) +# # model_path = test_path / "basic.pt" + +# model = torch.nn.Linear(2, 1) +# torch.save(model, model_path) + +# return model_path + + +# def mock_messages( +# worker_manager_queue: CommChannelBase, +# feature_store: FeatureStore, +# feature_store_root_dir: pathlib.Path, +# comm_channel_root_dir: pathlib.Path, +# ) -> None: +# """Mock event producer for triggering the inference pipeline""" +# feature_store_root_dir.mkdir(parents=True, exist_ok=True) +# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + +# model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") +# model_bytes = model_path.read_bytes() +# model_key = str(feature_store_root_dir / "model_fs.pt") + +# feature_store[model_key] = model_bytes + +# iteration_number = 0 + +# while True: +# iteration_number += 1 +# time.sleep(1) +# # 1. for demo, ignore upstream and just put stuff into downstream +# # 2. for demo, only one downstream but we'd normally have to filter +# # msg content and send to the correct downstream (worker) queue +# # timestamp = time.time_ns() +# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" +# # mock_channel.touch() + +# # thread - just look for key (wait for keys) +# # call checkpoint, try to get non-persistent key, it blocks +# # working set size > 1 has side-effects +# # only incurs cost when working set size has been exceeded + +# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" +# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + +# input_path = feature_store_root_dir / f"{iteration_number}/input.pt" +# output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + +# input_key = str(input_path) +# output_key = str(output_path) + +# buffer = io.BytesIO() +# tensor = torch.randn((1, 2), dtype=torch.float32) +# torch.save(tensor, buffer) +# feature_store[input_key] = buffer.getvalue() +# fsd = feature_store.descriptor + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) +# message_model_key = MessageHandler.build_model_key(model_key, fsd) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=message_model_key, +# inputs=[message_tensor_input_key], +# outputs=[message_tensor_output_key], +# output_descriptors=[], +# custom_attributes=None, +# ) +# request_bytes = MessageHandler.serialize_request(request) +# worker_manager_queue.send(request_bytes) + + +# @pytest.fixture +# def prepare_environment(test_dir: str) -> pathlib.Path: +# """Cleanup prior outputs to run demo repeatedly""" +# path = pathlib.Path(f"{test_dir}/workermanager.log") +# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) +# return path + + +# def test_worker_manager(prepare_environment: pathlib.Path) -> None: +# """Test the worker manager""" + +# test_path = prepare_environment +# fs_path = test_path / "feature_store" +# comm_path = test_path / "comm_store" + +# to_worker_channel = dch.Channel.make_process_local() +# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) +# to_worker_fli_serialized = to_worker_fli.serialize() + +# # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader +# # or test environment may 
be unable to send messages w/queue +# os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_descriptor, +# ) +# integrated_worker = TorchWorker() + +# worker_manager = WorkerManager( +# config_loader, +# integrated_worker, +# as_service=True, +# cooldown=5, +# device="cpu", +# ) + +# worker_queue = config_loader.get_queue() +# if worker_queue is None: +# logger.warn( +# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" +# ) + +# # create a mock client application to populate the request queue +# msg_pump = mp.Process( +# target=mock_messages, +# args=( +# worker_queue, +# FileSystemFeatureStore(fs_path), +# fs_path, +# comm_path, +# ), +# ) +# msg_pump.start() + +# # create a process to execute commands +# process = mp.Process(target=worker_manager.execute) +# process.start() +# process.join(timeout=5) +# process.kill() +# msg_pump.kill() From 125dc840052c051c51d03336f9e0027dc3321984 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:37:05 -0500 Subject: [PATCH 39/49] add test_worker_manager back into test set --- tests/mli/test_worker_manager.py | 417 +++++++++++++++---------------- 1 file changed, 208 insertions(+), 209 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index ae764591f..51f445885 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -1,209 +1,208 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# import io -# import logging -# import multiprocessing as mp -# import pathlib -# import time - -# import pytest - -# torch = pytest.importorskip("torch") -# dragon = pytest.importorskip("dragon") - -# import base64 -# import os - -# import dragon.channels as dch -# from dragon import fli - -# from smartsim._core.mli.comm.channel.channel import CommChannelBase -# from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -# from smartsim._core.mli.infrastructure.control.workermanager import ( -# EnvironmentConfigLoader, -# WorkerManager, -# ) -# from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( -# DragonFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -# from smartsim._core.mli.message_handler import MessageHandler -# from smartsim.log import get_logger - -# from .channel import FileSystemCommChannel -# from .featurestore import FileSystemFeatureStore - -# logger = get_logger(__name__) -# # The tests in this file belong to the dragon group -# pytestmark = pytest.mark.dragon - - -# def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: -# """Create a simple torch model and persist to disk for -# testing purposes. - -# TODO: remove once unit tests are in place""" -# # test_path = pathlib.Path(work_dir) -# if not model_path.parent.exists(): -# model_path.parent.mkdir(parents=True, exist_ok=True) - -# model_path.unlink(missing_ok=True) -# # model_path = test_path / "basic.pt" - -# model = torch.nn.Linear(2, 1) -# torch.save(model, model_path) - -# return model_path - - -# def mock_messages( -# worker_manager_queue: CommChannelBase, -# feature_store: FeatureStore, -# feature_store_root_dir: pathlib.Path, -# comm_channel_root_dir: pathlib.Path, -# ) -> None: -# """Mock event producer for triggering the inference pipeline""" -# feature_store_root_dir.mkdir(parents=True, exist_ok=True) -# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - -# model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") -# model_bytes = model_path.read_bytes() -# model_key = str(feature_store_root_dir / "model_fs.pt") - -# feature_store[model_key] = model_bytes - -# iteration_number = 0 - -# while True: -# iteration_number += 1 -# time.sleep(1) -# # 1. for demo, ignore upstream and just put stuff into downstream -# # 2. 
for demo, only one downstream but we'd normally have to filter -# # msg content and send to the correct downstream (worker) queue -# # timestamp = time.time_ns() -# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" -# # mock_channel.touch() - -# # thread - just look for key (wait for keys) -# # call checkpoint, try to get non-persistent key, it blocks -# # working set size > 1 has side-effects -# # only incurs cost when working set size has been exceeded - -# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" -# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - -# input_path = feature_store_root_dir / f"{iteration_number}/input.pt" -# output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - -# input_key = str(input_path) -# output_key = str(output_path) - -# buffer = io.BytesIO() -# tensor = torch.randn((1, 2), dtype=torch.float32) -# torch.save(tensor, buffer) -# feature_store[input_key] = buffer.getvalue() -# fsd = feature_store.descriptor - -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) -# message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) -# message_model_key = MessageHandler.build_model_key(model_key, fsd) - -# request = MessageHandler.build_request( -# reply_channel=callback_channel.descriptor, -# model=message_model_key, -# inputs=[message_tensor_input_key], -# outputs=[message_tensor_output_key], -# output_descriptors=[], -# custom_attributes=None, -# ) -# request_bytes = MessageHandler.serialize_request(request) -# worker_manager_queue.send(request_bytes) - - -# @pytest.fixture -# def prepare_environment(test_dir: str) -> pathlib.Path: -# """Cleanup prior outputs to run demo repeatedly""" -# path = pathlib.Path(f"{test_dir}/workermanager.log") -# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) -# return path - - -# def test_worker_manager(prepare_environment: pathlib.Path) -> None: -# """Test the worker manager""" - -# test_path = prepare_environment -# fs_path = test_path / "feature_store" -# comm_path = test_path / "comm_store" - -# to_worker_channel = dch.Channel.make_process_local() -# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) -# to_worker_fli_serialized = to_worker_fli.serialize() - -# # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader -# # or test environment may be unable to send messages w/queue -# os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_descriptor, -# ) -# integrated_worker = TorchWorker() - -# worker_manager = WorkerManager( -# config_loader, -# integrated_worker, -# as_service=True, -# cooldown=5, -# device="cpu", -# ) - -# worker_queue = config_loader.get_queue() -# if worker_queue is None: -# logger.warn( -# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" -# ) - -# # create a mock client application to populate the request queue -# msg_pump = mp.Process( -# target=mock_messages, -# args=( -# worker_queue, -# FileSystemFeatureStore(fs_path), -# fs_path, -# comm_path, -# ), -# ) -# msg_pump.start() - -# # create a process to execute commands -# process = mp.Process(target=worker_manager.execute) -# process.start() -# process.join(timeout=5) -# process.kill() -# msg_pump.kill() +# BSD 
2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import multiprocessing as mp +import pathlib +import time + +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import os + +import dragon.channels as dch +from channel import FileSystemCommChannel +from dragon import fli +from featurestore import FileSystemFeatureStore + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, + WorkerManager, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + # model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + worker_manager_queue: CommChannelBase, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + iteration_number = 0 + + while True: + iteration_number += 1 + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + # timestamp = time.time_ns() + # mock_channel = test_path / f"brainstorm-{timestamp}.txt" + # mock_channel.touch() + + # thread - just look for key (wait for keys) + # call checkpoint, try to get non-persistent key, it blocks + # working set size > 1 has side-effects + # only incurs cost when working set size has been exceeded + + channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" + callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + buffer = io.BytesIO() + tensor = torch.randn((1, 2), dtype=torch.float32) + torch.save(tensor, buffer) + feature_store[input_key] = buffer.getvalue() + fsd = feature_store.descriptor + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=callback_channel.descriptor, + model=message_model_key, + inputs=[message_tensor_input_key], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + worker_manager_queue.send(request_bytes) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def test_worker_manager(prepare_environment: pathlib.Path) -> None: + """Test the worker manager""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + os.environ["SSQueue"] = 
base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker = TorchWorker() + + worker_manager = WorkerManager( + config_loader, + integrated_worker, + as_service=True, + cooldown=5, + device="cpu", + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + FileSystemFeatureStore(fs_path), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + # create a process to execute commands + process = mp.Process(target=worker_manager.execute) + process.start() + process.join(timeout=5) + process.kill() + msg_pump.kill() From 783294ad0d0b18bde7fab7e9a2226ebdadd9bb88 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:10:09 -0500 Subject: [PATCH 40/49] rename SSQueue env var to SS_QUEUE --- ex/high_throughput_inference/standalone_workermanager.py | 2 +- smartsim/_core/mli/infrastructure/environmentloader.py | 4 ++-- tests/dragon/test_environment_loader.py | 6 +++--- tests/dragon/test_error_handling.py | 4 ++-- tests/mli/test_worker_manager.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 91a425ae4..62b930b8a 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -80,7 +80,7 @@ worker_type_name = base64.b64decode(args.worker_class.encode("ascii")) torch_worker = cloudpickle.loads(worker_type_name)() - os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 3f52d8d83..762b00769 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -90,8 +90,8 @@ def get_queue(self) -> t.Optional[CommChannelBase]: """Attach to a queue-like communication channel using the descriptor found in an environment variable. 
-    :returns: The attached queue specified via SSQueue"""
-    descriptor = os.getenv("SSQueue", "")
+    :returns: The attached queue specified via SS_QUEUE"""
+    descriptor = os.getenv("SS_QUEUE", "")
 
     if not descriptor:
         logger.warning("No queue descriptor is configured")
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 46a4a5cb4..12c089792 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -55,7 +55,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke
     """A descriptor can be stored, loaded, and reattached"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -76,7 +76,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     queue are the same"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -89,7 +89,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
 
 def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
     """An incorrect serialized descriptor fails to attach"""
-    monkeypatch.setenv("SSQueue", "randomstring")
+    monkeypatch.setenv("SS_QUEUE", "randomstring")
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=None,
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index e576452b7..febe75ce9 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -89,7 +89,7 @@ def setup_worker_manager_model_bytes(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
@@ -127,7 +127,7 @@ def setup_worker_manager_model_key(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py
index 51f445885..380c6b06e 100644
--- a/tests/mli/test_worker_manager.py
+++ b/tests/mli/test_worker_manager.py
@@ -165,7 +165,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None:
 
     # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader
     # or test environment may be unable to send messages w/queue
-    os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
 
     config_loader = EnvironmentConfigLoader(

From 
9bce16a275c30004995d038cd6268c39540791a3 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:14:25 -0500 Subject: [PATCH 41/49] remove commented code, rename variable for clarity --- .../_core/mli/infrastructure/control/workermanager.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 6f52d5364..aa30b019f 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -121,7 +121,7 @@ def __init__( """Dictionary of previously loaded models""" self._feature_stores: t.Dict[str, FeatureStore] = {} """A collection of attached feature stores""" - self._fs_factory = config_loader._featurestore_factory + self._featurestore_factory = config_loader._featurestore_factory """A factory method to create a desired feature store client type""" self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() """A standalone, system-created feature store used to share internal @@ -145,7 +145,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - if self._fs_factory is None: + if self._featurestore_factory is None: logger.warning("No feature store factory configured") return False @@ -153,7 +153,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: if fs_missing: logger.debug(f"Adding feature store(s): {fs_missing}") for descriptor in fs_missing: - feature_store = self._fs_factory(descriptor) + feature_store = self._featurestore_factory(descriptor) self._feature_stores[descriptor] = feature_store return True @@ -262,11 +262,6 @@ def _on_iteration(self) -> None: ) return - # if request.model_key.descriptor not in self._feature_stores: - # self._fs_factory(request.model_key.descriptor) - # todo: decide if we should load here or in _check_feature_stores. - # todo: should i raise error here? 
-
         if request.model_key.key in self._cached_models:
             timings.append(time.perf_counter() - interm) # timing
             interm = time.perf_counter() # timing

From 446d0008d1ac329119fc919000b19fcbb605ce7b Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Tue, 6 Aug 2024 10:20:22 -0500
Subject: [PATCH 42/49] rename SS_QUEUE env var to SS_REQUEST_QUEUE

---
 ex/high_throughput_inference/standalone_workermanager.py | 3 ++-
 smartsim/_core/mli/infrastructure/environmentloader.py | 4 ++--
 tests/dragon/test_environment_loader.py | 6 +++---
 tests/dragon/test_error_handling.py | 4 ++--
 tests/mli/test_worker_manager.py | 3 ++-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py
index 62b930b8a..2b5ba7df4 100644
--- a/ex/high_throughput_inference/standalone_workermanager.py
+++ b/ex/high_throughput_inference/standalone_workermanager.py
@@ -80,7 +80,8 @@
     worker_type_name = base64.b64decode(args.worker_class.encode("ascii"))
     torch_worker = cloudpickle.loads(worker_type_name)()
 
-    os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    os.environ["SS_REQUEST_QUEUE"] = descriptor
 
     config_loader = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index 762b00769..b4b9e565c 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -90,8 +90,8 @@ def get_queue(self) -> t.Optional[CommChannelBase]:
     """Attach to a queue-like communication channel using the descriptor
     found in an environment variable.
 
-    :returns: The attached queue specified via SS_QUEUE"""
-    descriptor = os.getenv("SS_QUEUE", "")
+    :returns: The attached queue specified via `SS_REQUEST_QUEUE`"""
+    descriptor = os.getenv("SS_REQUEST_QUEUE", "")
 
     if not descriptor:
         logger.warning("No queue descriptor is configured")
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 12c089792..6ae5d2b30 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -55,7 +55,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke
     """A descriptor can be stored, loaded, and reattached"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -76,7 +76,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     queue are the same"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -89,7 +89,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
 
 def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
     """An incorrect serialized descriptor fails to attach"""
-    monkeypatch.setenv("SS_QUEUE", "randomstring")
+    monkeypatch.setenv("SS_REQUEST_QUEUE", "randomstring")
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=None,
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index febe75ce9..208ab1e5e 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -89,7 +89,7 @@ def setup_worker_manager_model_bytes(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
@@ -127,7 +127,7 @@ def setup_worker_manager_model_key(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py
index 380c6b06e..cf385354e 100644
--- a/tests/mli/test_worker_manager.py
+++ b/tests/mli/test_worker_manager.py
@@ -165,7 +165,8 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None:
 
     # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader
     # or test environment may be unable to send messages w/queue
-    os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    os.environ["SS_REQUEST_QUEUE"] = descriptor
 
     config_loader = EnvironmentConfigLoader(
featurestore_factory=DragonFeatureStore.from_descriptor, From 989db29a77c4eb2de917ebc651e2c380ae018cb9 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:33:43 -0500 Subject: [PATCH 43/49] replaced log.warning w/log.error on missing components --- smartsim/_core/mli/infrastructure/control/workermanager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index aa30b019f..dcc35ae83 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -146,7 +146,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_missing = fs_desired - fs_actual if self._featurestore_factory is None: - logger.warning("No feature store factory configured") + logger.error("No feature store factory configured") return False # create the feature stores we need to service request @@ -215,7 +215,7 @@ def _on_iteration(self) -> None: logger.debug("executing worker manager pipeline") if self._task_queue is None: - logger.warning("No queue to check for tasks") + logger.error("No queue to check for tasks") return timings = [] # timing From 3e8d6ebc6b4722642af45b1be88c58e6ff96ad1e Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:43:07 -0500 Subject: [PATCH 44/49] improve DragonFeatureStore docstrings --- .../storage/dragonfeaturestore.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 5f42ef0bd..012f3cb2e 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -43,18 +43,24 @@ class DragonFeatureStore(FeatureStore): """A feature store backed by a dragon distributed dictionary""" def __init__(self, storage: "dragon_ddict.DDict") -> None: - """Initialize the DragonFeatureStore instance""" + """Initialize the DragonFeatureStore instance + + :param storage: A distributed dictionary to be used as the underlying + storage mechanism of the feature store""" self._storage = storage def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key - :param key: Unique key of an item to retrieve from the feature store""" + :param key: Unique key of an item to retrieve from the feature store + :returns: The value identified by the supplied key + :raises KeyError: if the key is not found in the feature store + :raises SmartSimError: if retrieval from the feature store fails""" try: value: t.Union[str, bytes] = self._storage[key] return value except KeyError as ex: - raise ex + raise except Exception as ex: # note: explicitly avoid round-trip to check for key existence raise SmartSimError( @@ -90,9 +96,12 @@ def from_descriptor( """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached DragonFeatureStore""" + :returns: An attached DragonFeatureStore + :raises SmartSimError: if attachment to DragonFeatureStore fails""" try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) - except: + except Exception as ex: logger.error(f"Error creating dragon feature store: {descriptor}") - raise + raise 
SmartSimError( + f"Error creating dragon feature store: {descriptor}" + ) from ex From 0344398ebf870ed4c37b59b70aa0c05e47112799 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 11:08:43 -0500 Subject: [PATCH 45/49] ensure KeyError is logged --- .../_core/mli/infrastructure/storage/dragonfeaturestore.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 012f3cb2e..e89abcd2a 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -59,7 +59,8 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]: try: value: t.Union[str, bytes] = self._storage[key] return value - except KeyError as ex: + except KeyError: + logger.warning(f"An unknown key was requested: {key}") raise except Exception as ex: # note: explicitly avoid round-trip to check for key existence From d040e289a5662176ef9db76a8686ed9b12706866 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:37:01 -0500 Subject: [PATCH 46/49] move dragon-based test into correct subdir --- tests/{mli => dragon}/test_worker_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename tests/{mli => dragon}/test_worker_manager.py (98%) diff --git a/tests/mli/test_worker_manager.py b/tests/dragon/test_worker_manager.py similarity index 98% rename from tests/mli/test_worker_manager.py rename to tests/dragon/test_worker_manager.py index cf385354e..57585aac9 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -39,9 +39,9 @@ import os import dragon.channels as dch -from channel import FileSystemCommChannel +from .utils.channel import FileSystemCommChannel +from .featurestore import FileSystemFeatureStore from dragon import fli -from featurestore import FileSystemFeatureStore from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel From 5645f79d0e9aba0e917ca95d6587e3dc93418ef2 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:54:20 -0500 Subject: [PATCH 47/49] formatting fix --- tests/dragon/test_worker_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 57585aac9..864e14993 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -39,8 +39,6 @@ import os import dragon.channels as dch -from .utils.channel import FileSystemCommChannel -from .featurestore import FileSystemFeatureStore from dragon import fli from smartsim._core.mli.comm.channel.channel import CommChannelBase @@ -57,6 +55,9 @@ from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon From 1e743156787a18ff9bcef3c1cd81aa3720a71bf5 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:10:28 -0500 Subject: [PATCH 48/49] remove asset URL overrides --- smartsim/_core/_cli/scripts/dragon_install.py | 16 +-- 
tests/test_dragon_installer.py | 108 +++++++++--------- 2 files changed, 55 insertions(+), 69 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 48eb750e7..f88af4eb4 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -163,27 +163,13 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # if we've previously downloaded the release and still have # wheels laying around, use that cached version instead if download_dir.exists() or list(download_dir.rglob("*.whl")): - # return download_dir - shutil.rmtree(str(download_dir)) + return download_dir download_dir.mkdir(parents=True, exist_ok=True) # grab a copy of the complete asset asset_path = download_dir / str(asset.name) download_url = asset.browser_download_url - if "0.91" not in asset.name: - if "3.9" in python_version(): - logger.debug("I want to snake the original w/3.9 rpm") - # download_url = "https://arti.hpc.amslabs.hpecorp.net/ui/native/dragon-rpm-master-local/dev/master/sle15_sp3_pe/x86_64/dragon-0.91-py3.11.5-1d600977c.rpm" - ... # temp no-op - elif "3.10" in python_version(): - logger.debug("snaking original w/3.10 rpm") - download_url = "https://drive.usercontent.google.com/download?id=1dyScGNomzoPO8-bC8i6zaIbOOhsL83Sp&export=download&authuser=0&confirm=t&uuid=6068afeb-14fd-4303-90a5-498b316d3cce&at=APZUnTWTIf9Tl7Yt8tcdKyodnydV:1722641072921" - elif "3.11" in python_version(): - logger.debug("snaking original w/3.11rpm") - download_url = "https://drive.usercontent.google.com/download?id=1vhUXLIu06-RPA_N3wWmi42avnawzizZZ&export=download&authuser=0&confirm=t&uuid=04c920cb-2e66-4762-8e0f-8ad57e0cbbdf&at=APZUnTUKtCv_BgYOkWAaHqoPpGLd:1722640947383" - else: - logger.debug(f"the name was: {asset.name}") try: urlretrieve(download_url, str(asset_path)) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index ea4d3eb55..4bf589ad4 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -156,60 +156,60 @@ def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: assert not test_archive.exists() -# def test_retrieve_cached( -# test_dir: str, -# # archive_path: pathlib.Path, -# test_archive: pathlib.Path, -# monkeypatch: pytest.MonkeyPatch, -# ) -> None: -# """Verify that a previously retrieved asset archive is re-used and the -# release asset retrieval is not attempted""" - -# asset_id = 123 - -# def mock_webtgz_extract(self_, target_) -> None: -# mock_extraction_dir = pathlib.Path(target_) -# with tarfile.TarFile.open(test_archive) as tar: -# tar.extractall(mock_extraction_dir) - -# # we'll use the mock extract to create the files that would normally be downloaded -# expected_output_dir = test_archive.parent / str(asset_id) -# mock_webtgz_extract(None, expected_output_dir) - -# # get modification time of directory holding the "downloaded" archive -# ts1 = expected_output_dir.stat().st_ctime - -# requester = Requester( -# auth=None, -# base_url="https://github.com", -# user_agent="mozilla", -# per_page=10, -# verify=False, -# timeout=1, -# retry=1, -# pool_size=1, -# ) -# headers = {"mock-header": "mock-value"} -# attributes = {"mock-attr": "mock-attr-value"} -# completed = True - -# asset = GitReleaseAsset(requester, headers, attributes, completed) - -# # ensure mocked asset has values that we use... 
-# monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) -# monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) -# monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) - -# # show that retrieving an asset w/a different ID results in ignoring -# # other wheels from prior downloads in the parent directory of the asset -# asset_path = retrieve_asset(test_archive.parent, asset) -# ts2 = asset_path.stat().st_ctime - -# # NOTE: the file should be written to a subdir based on the asset ID -# assert ( -# asset_path == expected_output_dir -# ) # shows that the expected path matches the output path -# assert ts1 == ts2 # show that the file wasn't changed... +def test_retrieve_cached( + test_dir: str, + # archive_path: pathlib.Path, + test_archive: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify that a previously retrieved asset archive is re-used and the + release asset retrieval is not attempted""" + + asset_id = 123 + + def mock_webtgz_extract(self_, target_) -> None: + mock_extraction_dir = pathlib.Path(target_) + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(mock_extraction_dir) + + # we'll use the mock extract to create the files that would normally be downloaded + expected_output_dir = test_archive.parent / str(asset_id) + mock_webtgz_extract(None, expected_output_dir) + + # get modification time of directory holding the "downloaded" archive + ts1 = expected_output_dir.stat().st_ctime + + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + # ensure mocked asset has values that we use... + monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) + monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + + # show that retrieving an asset w/a different ID results in ignoring + # other wheels from prior downloads in the parent directory of the asset + asset_path = retrieve_asset(test_archive.parent, asset) + ts2 = asset_path.stat().st_ctime + + # NOTE: the file should be written to a subdir based on the asset ID + assert ( + asset_path == expected_output_dir + ) # shows that the expected path matches the output path + assert ts1 == ts2 # show that the file wasn't changed... 
def test_retrieve_updated( From 6097c46ec8d535be98f1772d69e53021154722cc Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:57:04 -0500 Subject: [PATCH 49/49] remove usage of deprecated dragon policy affinity --- .../_core/launcher/dragon/dragonBackend.py | 18 ++++++------------ tests/test_dragon_run_policy.py | 5 ----- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 16f5c03dc..4fe6d55ad 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -457,7 +457,6 @@ def create_run_policy( if isinstance(request, DragonRunRequest): run_request: DragonRunRequest = request - affinity = dragon_policy.Policy.Affinity.DEFAULT cpu_affinity: t.List[int] = [] gpu_affinity: t.List[int] = [] @@ -465,25 +464,20 @@ def create_run_policy( if run_request.policy is not None: # Affinities are not mutually exclusive. If specified, both are used if run_request.policy.cpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC cpu_affinity = run_request.policy.cpu_affinity if run_request.policy.gpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity logger.debug( - f"Affinity strategy: {affinity}, " f"CPU affinity mask: {cpu_affinity}, " f"GPU affinity mask: {gpu_affinity}" ) - if affinity != dragon_policy.Policy.Affinity.DEFAULT: - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - affinity=affinity, - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py index 1d8d069fa..c94ae375b 100644 --- a/tests/test_dragon_run_policy.py +++ b/tests/test_dragon_run_policy.py @@ -143,7 +143,6 @@ def test_create_run_policy_run_request_no_run_policy() -> None: assert policy.device == Policy.Device.DEFAULT assert set(policy.cpu_affinity) == set() assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -167,7 +166,6 @@ def test_create_run_policy_run_request_default_run_policy() -> None: assert set(policy.cpu_affinity) == set() assert set(policy.gpu_affinity) == set() - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -192,7 +190,6 @@ def test_create_run_policy_run_request_cpu_affinity_no_device() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -216,7 +213,6 @@ def test_create_run_policy_run_request_cpu_affinity() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -240,7 +236,6 @@ def test_create_run_policy_run_request_gpu_affinity() -> None: assert policy.cpu_affinity == [] assert set(policy.gpu_affinity) 
== set(affinity) - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
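
The final patch relies on `dragon_policy.Policy` accepting (possibly empty) `cpu_affinity` and `gpu_affinity` lists directly, rather than gating them behind the removed `Policy.Affinity` enum: the host-pinned policy is now always constructed, and an empty list simply means "no pinning." What follows is a minimal, hypothetical sketch of that control flow which runs without Dragon installed; `StubPolicy` is a stand-in for `dragon.infrastructure.policy.Policy` (importable only on Dragon-enabled systems) and mirrors just the fields used in the hunk above.

import typing as t
from dataclasses import dataclass, field


@dataclass
class StubPolicy:
    """Hypothetical stand-in for dragon.infrastructure.policy.Policy."""

    placement: str
    host_name: str
    cpu_affinity: t.List[int] = field(default_factory=list)
    gpu_affinity: t.List[int] = field(default_factory=list)


def create_run_policy(
    host_name: str,
    cpu_affinity: t.Optional[t.List[int]] = None,
    gpu_affinity: t.Optional[t.List[int]] = None,
) -> StubPolicy:
    # After PATCH 49, both affinity lists are forwarded unconditionally;
    # empty lists fall back to the scheduler default, so no SPECIFIC/DEFAULT
    # affinity flag is needed to distinguish the two cases.
    return StubPolicy(
        placement="HOST_NAME",
        host_name=host_name,
        cpu_affinity=cpu_affinity or [],
        gpu_affinity=gpu_affinity or [],
    )


# pinned and unpinned requests now take the same code path
assert create_run_policy("node001", cpu_affinity=[0, 1]).cpu_affinity == [0, 1]
assert create_run_policy("node002").gpu_affinity == []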